From 8fac58a60806471736a4074c1ded1cc5752364a5 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Wed, 22 Jan 2025 17:22:25 +0100 Subject: [PATCH 1/8] Deprecate and replace CUB iterators existing in Thrust We keep CUB iterators inside the CUB implementation headers, because some of those are exposed to NVRTC which cannot compile Thrust iterators yet. --- .../bench/run_length_encode/encode.cu | 4 +++- cub/cub/agent/agent_reduce_by_key.cuh | 1 - cub/cub/agent/agent_rle.cuh | 1 - cub/cub/agent/agent_segment_fixup.cuh | 1 - cub/cub/agent/agent_spmv_orig.cuh | 6 ++++++ cub/cub/device/device_run_length_encode.cuh | 2 ++ .../device/dispatch/dispatch_spmv_orig.cuh | 2 ++ .../dispatch/dispatch_streaming_reduce.cuh | 2 ++ cub/cub/iterator/constant_input_iterator.cuh | 4 +++- cub/cub/iterator/counting_input_iterator.cuh | 4 +++- cub/cub/iterator/discard_output_iterator.cuh | 4 +++- cub/cub/iterator/transform_input_iterator.cuh | 4 +++- .../catch2_test_block_run_length_decode.cu | 9 +++++---- cub/test/catch2_test_block_store.cu | 1 - cub/test/catch2_test_device_for.cu | 4 ++-- cub/test/catch2_test_device_for_copy.cu | 4 ++-- cub/test/catch2_test_device_histogram.cu | 7 ++++--- cub/test/catch2_test_device_reduce.cuh | 8 ++++---- cub/test/catch2_test_iterator.cu | 20 ++++++++++++++++++- cub/test/catch2_test_util_type.cu | 9 +++++---- cub/test/test_device_batch_memcpy.cu | 6 +++--- cub/test/test_util.h | 5 +++-- 22 files changed, 74 insertions(+), 34 deletions(-) diff --git a/cub/benchmarks/bench/run_length_encode/encode.cu b/cub/benchmarks/bench/run_length_encode/encode.cu index 82e73f4a87b..9a62b073e75 100644 --- a/cub/benchmarks/bench/run_length_encode/encode.cu +++ b/cub/benchmarks/bench/run_length_encode/encode.cu @@ -27,6 +27,8 @@ #include +#include + #include #include @@ -74,7 +76,7 @@ static void rle(nvbench::state& state, nvbench::type_list) using offset_t = OffsetT; using keys_input_it_t = const T*; using unique_output_it_t = T*; - using vals_input_it_t = cub::ConstantInputIterator; + using vals_input_it_t = thrust::constant_iterator; using aggregate_output_it_t = offset_t*; using num_runs_output_iterator_t = offset_t*; using equality_op_t = ::cuda::std::equal_to<>; diff --git a/cub/cub/agent/agent_reduce_by_key.cuh b/cub/cub/agent/agent_reduce_by_key.cuh index ac0d9045ab9..a90399f4325 100644 --- a/cub/cub/agent/agent_reduce_by_key.cuh +++ b/cub/cub/agent/agent_reduce_by_key.cuh @@ -49,7 +49,6 @@ #include #include #include -#include #include diff --git a/cub/cub/agent/agent_rle.cuh b/cub/cub/agent/agent_rle.cuh index f8898fa4281..2ea0729db92 100644 --- a/cub/cub/agent/agent_rle.cuh +++ b/cub/cub/agent/agent_rle.cuh @@ -52,7 +52,6 @@ #include #include #include -#include #include #include diff --git a/cub/cub/agent/agent_segment_fixup.cuh b/cub/cub/agent/agent_segment_fixup.cuh index caabf774ba8..717b9b115e9 100644 --- a/cub/cub/agent/agent_segment_fixup.cuh +++ b/cub/cub/agent/agent_segment_fixup.cuh @@ -50,7 +50,6 @@ #include #include #include -#include #include diff --git a/cub/cub/agent/agent_spmv_orig.cuh b/cub/cub/agent/agent_spmv_orig.cuh index 80d571d58db..90a5e3aa6c9 100644 --- a/cub/cub/agent/agent_spmv_orig.cuh +++ b/cub/cub/agent/agent_spmv_orig.cuh @@ -386,7 +386,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv __syncthreads(); // Search for the thread's starting coordinate within the merge tile + _CCCL_SUPPRESS_DEPRECATED_PUSH CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + _CCCL_SUPPRESS_DEPRECATED_POP CoordinateT thread_start_coord; MergePathSearch( @@ -567,7 +569,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv __syncthreads(); // Search for the thread's starting coordinate within the merge tile + _CCCL_SUPPRESS_DEPRECATED_PUSH CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + _CCCL_SUPPRESS_DEPRECATED_POP CoordinateT thread_start_coord; MergePathSearch( @@ -701,7 +705,9 @@ struct CCCL_DEPRECATED_BECAUSE("Use the cuSPARSE library instead") AgentSpmv // Search our starting coordinates OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; CoordinateT tile_coord; + _CCCL_SUPPRESS_DEPRECATED_PUSH CountingInputIterator nonzero_indices(0); + _CCCL_SUPPRESS_DEPRECATED_POP // Search the merge path MergePathSearch( diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 751cdd46424..06d2d9c0900 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -199,7 +199,9 @@ struct DeviceRunLengthEncode using length_t = cub::detail::non_void_value_t; // Generator type for providing 1s values for run-length reduction + _CCCL_SUPPRESS_DEPRECATED_PUSH using lengths_input_iterator_t = ConstantInputIterator; + _CCCL_SUPPRESS_DEPRECATED_POP using accum_t = ::cuda::std::__accumulator_t; diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index 24ef2845dee..cd377a6d991 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -156,7 +156,9 @@ CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel( { OffsetT diagonal = (tile_idx * TILE_ITEMS); CoordinateT tile_coordinate; + _CCCL_SUPPRESS_DEPRECATED_PUSH CountingInputIterator nonzero_indices(0); + _CCCL_SUPPRESS_DEPRECATED_POP // Search the merge path MergePathSearch( diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index a80a853119d..78d755cada0 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -229,7 +229,9 @@ struct dispatch_streaming_arg_reduce_t cudaStream_t stream) { // Constant iterator to provide the offset of the current partition for the user-provided input iterator + _CCCL_SUPPRESS_DEPRECATED_PUSH using constant_offset_it_t = ConstantInputIterator; + _CCCL_SUPPRESS_DEPRECATED_POP // Wrapped input iterator to produce index-value tuples, i.e., -tuples // We make sure to offset the user-provided input iterator by the current partition's offset diff --git a/cub/cub/iterator/constant_input_iterator.cuh b/cub/cub/iterator/constant_input_iterator.cuh index 87252292d86..089e5b5869e 100644 --- a/cub/cub/iterator/constant_input_iterator.cuh +++ b/cub/cub/iterator/constant_input_iterator.cuh @@ -87,7 +87,7 @@ CUB_NAMESPACE_BEGIN * The difference type of this iterator (Default: @p ptrdiff_t) */ template -class ConstantInputIterator +class CCCL_DEPRECATED_BECAUSE("Use thrust::constant_iterator instead") ConstantInputIterator { public: // Required iterator traits @@ -216,11 +216,13 @@ public: } /// ostream operator + _CCCL_SUPPRESS_DEPRECATED_PUSH friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "[" << itr.val << "," << itr.offset << "]"; return os; } + _CCCL_SUPPRESS_DEPRECATED_POP }; CUB_NAMESPACE_END diff --git a/cub/cub/iterator/counting_input_iterator.cuh b/cub/cub/iterator/counting_input_iterator.cuh index 529d6a990b1..f24867ebf74 100644 --- a/cub/cub/iterator/counting_input_iterator.cuh +++ b/cub/cub/iterator/counting_input_iterator.cuh @@ -90,7 +90,7 @@ CUB_NAMESPACE_BEGIN * The difference type of this iterator (Default: @p ptrdiff_t) */ template -class CountingInputIterator +class CCCL_DEPRECATED_BECAUSE("Use thrust::counting_iterator instead") CountingInputIterator { public: // Required iterator traits @@ -218,11 +218,13 @@ public: /// ostream operator #if !_CCCL_COMPILER(NVRTC) + _CCCL_SUPPRESS_DEPRECATED_PUSH friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "[" << itr.val << "]"; return os; } + _CCCL_SUPPRESS_DEPRECATED_POP #endif // !_CCCL_COMPILER(NVRTC) }; diff --git a/cub/cub/iterator/discard_output_iterator.cuh b/cub/cub/iterator/discard_output_iterator.cuh index 4b3698d53f3..0d7fe50048a 100644 --- a/cub/cub/iterator/discard_output_iterator.cuh +++ b/cub/cub/iterator/discard_output_iterator.cuh @@ -54,7 +54,7 @@ CUB_NAMESPACE_BEGIN * @brief A discard iterator */ template -class DiscardOutputIterator +class CCCL_DEPRECATED_BECAUSE("Use thrust::discard_iterator instead") DiscardOutputIterator { public: // Required iterator traits @@ -191,11 +191,13 @@ public: } /// ostream operator + _CCCL_SUPPRESS_DEPRECATED_PUSH friend std::ostream& operator<<(std::ostream& os, const self_type& itr) { os << "[" << itr.offset << "]"; return os; } + _CCCL_SUPPRESS_DEPRECATED_POP }; CUB_NAMESPACE_END diff --git a/cub/cub/iterator/transform_input_iterator.cuh b/cub/cub/iterator/transform_input_iterator.cuh index 9e3166afb82..150e28bc0ca 100644 --- a/cub/cub/iterator/transform_input_iterator.cuh +++ b/cub/cub/iterator/transform_input_iterator.cuh @@ -110,7 +110,7 @@ CUB_NAMESPACE_BEGIN * The difference type of this iterator (Default: @p ptrdiff_t) */ template -class TransformInputIterator +class CCCL_DEPRECATED_BECAUSE("Use thrust::transform_iterator instead") TransformInputIterator { public: // Required iterator traits @@ -233,10 +233,12 @@ public: } /// ostream operator + _CCCL_SUPPRESS_DEPRECATED_PUSH friend std::ostream& operator<<(std::ostream& os, const self_type& /* itr */) { return os; } + _CCCL_SUPPRESS_DEPRECATED_POP }; CUB_NAMESPACE_END diff --git a/cub/test/catch2_test_block_run_length_decode.cu b/cub/test/catch2_test_block_run_length_decode.cu index dc322e49f8a..488342ed202 100644 --- a/cub/test/catch2_test_block_run_length_decode.cu +++ b/cub/test/catch2_test_block_run_length_decode.cu @@ -29,10 +29,11 @@ #include #include #include -#include #include #include +#include + #include #include @@ -332,11 +333,11 @@ void TestAlgorithmSpecialisation() using RunItemT = float; using RunLengthT = uint32_t; - using ItemItT = cub::CountingInputIterator; - using RunLengthsItT = cub::TransformInputIterator>; + using ItemItT = thrust::counting_iterator; + using RunLengthsItT = thrust::transform_iterator>; ItemItT d_unique_items(1000U); - RunLengthsItT d_run_lengths(cub::CountingInputIterator(0), ModOp{}); + RunLengthsItT d_run_lengths(thrust::counting_iterator(0), ModOp{}); constexpr uint32_t num_runs = 10000; constexpr uint32_t num_blocks = (num_runs + (RUNS_PER_BLOCK - 1U)) / RUNS_PER_BLOCK; diff --git a/cub/test/catch2_test_block_store.cu b/cub/test/catch2_test_block_store.cu index e5b7883999e..f7f4907e569 100644 --- a/cub/test/catch2_test_block_store.cu +++ b/cub/test/catch2_test_block_store.cu @@ -27,7 +27,6 @@ #include #include -#include #include #include diff --git a/cub/test/catch2_test_device_for.cu b/cub/test/catch2_test_device_for.cu index 3c31fc33278..2526db1b381 100644 --- a/cub/test/catch2_test_device_for.cu +++ b/cub/test/catch2_test_device_for.cu @@ -29,11 +29,11 @@ // above header needs to be included first #include -#include #include #include #include +#include #include #include "catch2_test_launch_helper.h" @@ -260,7 +260,7 @@ C2H_TEST("Device for each works with counting iterator", "[for][device]") max_items, })); - const auto it = cub::CountingInputIterator{0}; + const auto it = thrust::counting_iterator{0}; c2h::device_vector counts(num_items); device_for_each(it, it + num_items, incrementer_t{thrust::raw_pointer_cast(counts.data())}); diff --git a/cub/test/catch2_test_device_for_copy.cu b/cub/test/catch2_test_device_for_copy.cu index 166da86e255..412e628f6fa 100644 --- a/cub/test/catch2_test_device_for_copy.cu +++ b/cub/test/catch2_test_device_for_copy.cu @@ -29,10 +29,10 @@ // above header needs to be included first #include -#include #include #include +#include #include #include "catch2_test_launch_helper.h" @@ -200,7 +200,7 @@ C2H_TEST("Device for each works with counting iterator", "[for][device]") max_items, })); - const auto it = cub::CountingInputIterator{0}; + const auto it = thrust::counting_iterator{0}; c2h::device_vector counts(num_items); device_for_each_copy(it, it + num_items, incrementer_t{thrust::raw_pointer_cast(counts.data())}); diff --git a/cub/test/catch2_test_device_histogram.cu b/cub/test/catch2_test_device_histogram.cu index 8a4186a406b..e258fb4bcb6 100644 --- a/cub/test/catch2_test_device_histogram.cu +++ b/cub/test/catch2_test_device_histogram.cu @@ -27,7 +27,8 @@ ******************************************************************************/ #include -#include + +#include #include #include @@ -495,7 +496,7 @@ C2H_TEST("DeviceHistogram::HistogramEven sample iterator", "[histogram_even][dev const auto lower_level = caller_vector{0, -10, cs::numeric_limits::lowest()}; const auto upper_level = caller_vector{total_values, 10, cs::numeric_limits::max()}; - auto sample_iterator = cub::CountingInputIterator(0); + auto sample_iterator = thrust::counting_iterator(0); // Channel #0: 0, 4, 8, 12 // Channel #1: 1, 5, 9, 13 @@ -584,7 +585,7 @@ C2H_TEST_LIST("DeviceHistogram::HistogramEven bin computation does not overflow" constexpr sample_t lower_level = 0; constexpr sample_t upper_level = cs::numeric_limits::max(); constexpr auto num_samples = 1000; - auto d_samples = cub::CountingInputIterator{0UL}; + auto d_samples = thrust::counting_iterator{0UL}; auto d_histo_out = c2h::device_vector(1024); const auto num_bins = GENERATE(1, 2); diff --git a/cub/test/catch2_test_device_reduce.cuh b/cub/test/catch2_test_device_reduce.cuh index 6e89b692ed0..ed9a806d19e 100644 --- a/cub/test/catch2_test_device_reduce.cuh +++ b/cub/test/catch2_test_device_reduce.cuh @@ -203,11 +203,11 @@ inline __half* unwrap_it(half_t* it) } template -inline cub::ConstantInputIterator<__half, OffsetT> unwrap_it(cub::ConstantInputIterator it) +inline thrust::constant_iterator<__half, OffsetT> unwrap_it(thrust::constant_iterator it) { half_t wrapped_val = *it; __half val = wrapped_val.operator __half(); - return cub::ConstantInputIterator<__half, OffsetT>(val); + return thrust::constant_iterator<__half, OffsetT>(val); } #endif @@ -218,11 +218,11 @@ inline __nv_bfloat16* unwrap_it(bfloat16_t* it) } template -cub::ConstantInputIterator<__nv_bfloat16, OffsetT> inline unwrap_it(cub::ConstantInputIterator it) +thrust::constant_iterator<__nv_bfloat16, OffsetT> inline unwrap_it(thrust::constant_iterator it) { bfloat16_t wrapped_val = *it; __nv_bfloat16 val = wrapped_val.operator __nv_bfloat16(); - return cub::ConstantInputIterator<__nv_bfloat16, OffsetT>(val); + return thrust::constant_iterator<__nv_bfloat16, OffsetT>(val); } #endif diff --git a/cub/test/catch2_test_iterator.cu b/cub/test/catch2_test_iterator.cu index a5dd52dfe04..c94ea59773b 100644 --- a/cub/test/catch2_test_iterator.cu +++ b/cub/test/catch2_test_iterator.cu @@ -26,6 +26,14 @@ * ******************************************************************************/ +#include + +// with NVHPC we get deprecation warnings originating from instantiations from cudafe1.stub.c, so we have to bulk +// suppress all deprecation warnings in this file (without a matching pop) +#if _CCCL_COMPILER(NVHPC) +_CCCL_SUPPRESS_DEPRECATED_PUSH +#endif + #include #include #include @@ -91,8 +99,10 @@ __global__ void test_iterator_kernel(InputIteratorT d_in, T* d_out, InputIterato d_itrs[1] = d_in; // Iterator at offset 0 } +_CCCL_SUPPRESS_DEPRECATED_PUSH template -void test_iterator(InputIteratorT d_in, const c2h::host_vector& h_reference) +void test_iterator(InputIteratorT d_in, const c2h::host_vector& h_reference) // + _CCCL_SUPPRESS_DEPRECATED_POP { c2h::device_vector d_out(h_reference.size()); c2h::device_vector d_itrs(2, d_in); // TODO(bgruber): using a raw allocation halves the compile time @@ -113,7 +123,9 @@ C2H_TEST("Test constant iterator", "[iterator]", scalar_types) using T = c2h::get<0, TestType>; const T base = static_cast(GENERATE(0, 99)); const auto h_reference = c2h::host_vector{base, base, base, base, base, base, base, base}; + _CCCL_SUPPRESS_DEPRECATED_PUSH test_iterator(cub::ConstantInputIterator(base), h_reference); + _CCCL_SUPPRESS_DEPRECATED_POP } C2H_TEST("Test counting iterator", "[iterator]", scalar_types) @@ -129,7 +141,9 @@ C2H_TEST("Test counting iterator", "[iterator]", scalar_types) static_cast(base + 21), static_cast(base + 11), static_cast(base + 0)}; + _CCCL_SUPPRESS_DEPRECATED_PUSH test_iterator(cub::CountingInputIterator(base), h_reference); + _CCCL_SUPPRESS_DEPRECATED_POP } using cache_modifiers = @@ -187,9 +201,11 @@ C2H_TEST("Test transform iterator", "[iterator]", types) op(h_data[21]), op(h_data[11]), op(h_data[0])}; + _CCCL_SUPPRESS_DEPRECATED_PUSH test_iterator(cub::TransformInputIterator, const T*>( const_cast(const_cast(thrust::raw_pointer_cast(d_data.data()))), op), h_reference); + _CCCL_SUPPRESS_DEPRECATED_POP } C2H_TEST("Test tex-obj texture iterator", "[iterator]", types) @@ -233,7 +249,9 @@ C2H_TEST("Test texture transform iterator", "[iterator]", types) TextureIterator d_tex_itr; CubDebugExit( d_tex_itr.BindTexture(const_cast(thrust::raw_pointer_cast(d_data.data())), sizeof(T) * TEST_VALUES)); + _CCCL_SUPPRESS_DEPRECATED_PUSH cub::TransformInputIterator, TextureIterator> xform_itr(d_tex_itr, op); + _CCCL_SUPPRESS_DEPRECATED_POP test_iterator(xform_itr, h_reference); CubDebugExit(d_tex_itr.UnbindTexture()); } diff --git a/cub/test/catch2_test_util_type.cu b/cub/test/catch2_test_util_type.cu index aba45096cd0..608571e2d94 100644 --- a/cub/test/catch2_test_util_type.cu +++ b/cub/test/catch2_test_util_type.cu @@ -25,10 +25,11 @@ * ******************************************************************************/ -#include -#include #include +#include +#include + #include #include @@ -36,8 +37,8 @@ C2H_TEST("Tests non_void_value_t", "[util][type]") { using fallback_t = float; - using void_fancy_it = cub::DiscardOutputIterator; - using non_void_fancy_it = cub::CountingInputIterator; + using void_fancy_it = thrust::discard_iterator; + using non_void_fancy_it = thrust::counting_iterator; // falls back for const void* STATIC_REQUIRE(::cuda::std::is_same -#include #include #include +#include #include #include #include @@ -283,12 +283,12 @@ void RunTest(BufferOffsetT num_buffers, // Prepare d_buffer_srcs OffsetToPtrOp src_transform_op{static_cast(thrust::raw_pointer_cast(d_in.data()))}; - cub::TransformInputIterator, ByteOffsetT*> d_buffer_srcs( + thrust::transform_iterator, ByteOffsetT*> d_buffer_srcs( thrust::raw_pointer_cast(d_buffer_src_offsets.data()), src_transform_op); // Prepare d_buffer_dsts OffsetToPtrOp dst_transform_op{static_cast(thrust::raw_pointer_cast(d_out.data()))}; - cub::TransformInputIterator, ByteOffsetT*> d_buffer_dsts( + thrust::transform_iterator, ByteOffsetT*> d_buffer_dsts( thrust::raw_pointer_cast(d_buffer_dst_offsets.data()), dst_transform_op); // Get temporary storage requirements diff --git a/cub/test/test_util.h b/cub/test/test_util.h index c06d803ecb1..031298120dc 100644 --- a/cub/test/test_util.h +++ b/cub/test/test_util.h @@ -35,7 +35,6 @@ # include #endif -#include #include #include #include @@ -44,6 +43,8 @@ #include #include +#include + #include #include #include @@ -1234,7 +1235,7 @@ inline int CompareDeviceResults( template int CompareDeviceResults( S* /*h_reference*/, - CUB_NS_QUALIFIER::DiscardOutputIterator /*d_data*/, + THRUST_NS_QUALIFIER::discard_iterator /*d_data*/, std::size_t /*num_items*/, bool /*verbose*/ = true, bool /*display_data*/ = false) From 52e580a6224fafd68f24bf5ecf9a7b8fefaee0f3 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 23 Jan 2025 00:22:39 +0100 Subject: [PATCH 2/8] missing include --- cub/cub/device/device_run_length_encode.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index 06d2d9c0900..fe0b93dff74 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -48,6 +48,7 @@ #include #include #include +#include #include From 9780a485008ae007a7bc4c786dd55b71583ef902 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 23 Jan 2025 00:41:02 +0100 Subject: [PATCH 3/8] WOrkaround nvhoc --- .../dispatch/dispatch_streaming_reduce.cuh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index 78d755cada0..bf7ab531b63 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -13,10 +13,6 @@ # pragma system_header #endif // no system header -_CCCL_SUPPRESS_DEPRECATED_PUSH -#include -_CCCL_SUPPRESS_DEPRECATED_POP - #include #include #include @@ -24,10 +20,13 @@ _CCCL_SUPPRESS_DEPRECATED_POP #include #include +#include #include #ifndef _CCCL_DOXYGEN_INVOKED // Do not document +// suppress deprecation warnings for ConstantInputIterator. NVHPC makes them appear more widely than necessary +_CCCL_SUPPRESS_DEPRECATED_PUSH CUB_NAMESPACE_BEGIN namespace detail::reduce @@ -190,6 +189,11 @@ template , PerPartitionOffsetT, ReductionOpT>> struct dispatch_streaming_arg_reduce_t { +# if _CCCL_COMPILER(NVHPC) + // NVHPC fails to suppress a deprecation when the alias is inside the function below + using constant_offset_it_t = ConstantInputIterator; +# endif // _CCCL_COMPILER(NVHPC) + // Internal dispatch routine for computing a device-wide argument extremum, like `ArgMin` and `ArgMax` // // @param[in] d_temp_storage @@ -229,9 +233,9 @@ struct dispatch_streaming_arg_reduce_t cudaStream_t stream) { // Constant iterator to provide the offset of the current partition for the user-provided input iterator - _CCCL_SUPPRESS_DEPRECATED_PUSH +# if !_CCCL_COMPILER(NVHPC) using constant_offset_it_t = ConstantInputIterator; - _CCCL_SUPPRESS_DEPRECATED_POP +# endif // Wrapped input iterator to produce index-value tuples, i.e., -tuples // We make sure to offset the user-provided input iterator by the current partition's offset @@ -375,7 +379,7 @@ struct dispatch_streaming_arg_reduce_t }; } // namespace detail::reduce - +_CCCL_SUPPRESS_DEPRECATED_POP CUB_NAMESPACE_END #endif // !_CCCL_DOXYGEN_INVOKED From 531e64710dc7d835d8aaedbf860bc9ac230bbfc6 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 23 Jan 2025 18:46:39 +0100 Subject: [PATCH 4/8] nvhpc suppression workarounds --- cub/cub/device/dispatch/dispatch_streaming_reduce.cuh | 9 ++++++++- cub/cub/iterator/constant_input_iterator.cuh | 8 +++++++- cub/test/catch2_test_device_run_length_encode.cu | 11 +++++++++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index bf7ab531b63..4c80f9646c8 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -25,8 +25,10 @@ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document +# if _CCCL_COMPILER(NVHPC) // suppress deprecation warnings for ConstantInputIterator. NVHPC makes them appear more widely than necessary _CCCL_SUPPRESS_DEPRECATED_PUSH +# endif // _CCCL_COMPILER(NVHPC) CUB_NAMESPACE_BEGIN namespace detail::reduce @@ -190,7 +192,8 @@ template ; # endif // _CCCL_COMPILER(NVHPC) @@ -234,7 +237,9 @@ struct dispatch_streaming_arg_reduce_t { // Constant iterator to provide the offset of the current partition for the user-provided input iterator # if !_CCCL_COMPILER(NVHPC) + _CCCL_SUPPRESS_DEPRECATED_PUSH using constant_offset_it_t = ConstantInputIterator; + _CCCL_SUPPRESS_DEPRECATED_POP # endif // Wrapped input iterator to produce index-value tuples, i.e., -tuples @@ -379,7 +384,9 @@ struct dispatch_streaming_arg_reduce_t }; } // namespace detail::reduce +# if _CCCL_COMPILER(NVHPC) _CCCL_SUPPRESS_DEPRECATED_POP +# endif // _CCCL_COMPILER(NVHPC) CUB_NAMESPACE_END #endif // !_CCCL_DOXYGEN_INVOKED diff --git a/cub/cub/iterator/constant_input_iterator.cuh b/cub/cub/iterator/constant_input_iterator.cuh index 089e5b5869e..a9e1813ad40 100644 --- a/cub/cub/iterator/constant_input_iterator.cuh +++ b/cub/cub/iterator/constant_input_iterator.cuh @@ -87,7 +87,13 @@ CUB_NAMESPACE_BEGIN * The difference type of this iterator (Default: @p ptrdiff_t) */ template -class CCCL_DEPRECATED_BECAUSE("Use thrust::constant_iterator instead") ConstantInputIterator +class +#ifndef __CUDA_ARCH__ + // Avoid generating a deprecation warning from length_encode.compute_xx.cpp1.ii, which is compiled by cicc for which + // we cannot suppress the warning + CCCL_DEPRECATED_BECAUSE("Use thrust::constant_iterator instead") +#endif + ConstantInputIterator { public: // Required iterator traits diff --git a/cub/test/catch2_test_device_run_length_encode.cu b/cub/test/catch2_test_device_run_length_encode.cu index 4fee40e99ef..1df22c77b17 100644 --- a/cub/test/catch2_test_device_run_length_encode.cu +++ b/cub/test/catch2_test_device_run_length_encode.cu @@ -25,6 +25,13 @@ * ******************************************************************************/ +#include + +#if _CCCL_COMPILER(NVHPC) +// to suppress warnings for CountingInputIterator +_CCCL_SUPPRESS_DEPRECATED_PUSH +#endif // _CCCL_COMPILER(NVHPC) + #include "insert_nested_NVTX_range_guard.h" // above header needs to be included first @@ -264,3 +271,7 @@ C2H_TEST("DeviceRunLengthEncode::Encode can handle leading NaN", "[device][run_l REQUIRE(out_counts == reference_counts); REQUIRE(out_num_runs == reference_num_runs); } + +#if _CCCL_COMPILER(NVHPC) +_CCCL_SUPPRESS_DEPRECATED_POP +#endif // _CCCL_COMPILER(NVHPC) From 0da76d82ac17316d9ba94c16248a10bf77ef2e18 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 24 Jan 2025 12:07:55 +0100 Subject: [PATCH 5/8] More suppression --- cub/test/catch2_test_device_reduce.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index d3a54bf64f2..d6cfdb11521 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -39,6 +39,10 @@ #include #include +// need to suppress deprecation warnings for ConstantInputIterator in the cudafe1.stub.c file, so there is no matching +// _CCCL_SUPPRESS_DEPRECATED_POP at the end of this file +_CCCL_SUPPRESS_DEPRECATED_PUSH + DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Reduce, device_reduce); DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Sum, device_sum); DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Min, device_min); From 7667afaef51d3f371bd76737df27b61cb3e0a2bb Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 24 Jan 2025 12:38:36 +0100 Subject: [PATCH 6/8] Wider suppression --- cub/cub/device/dispatch/dispatch_streaming_reduce.cuh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index 4c80f9646c8..9f7a22db741 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -25,10 +25,8 @@ #ifndef _CCCL_DOXYGEN_INVOKED // Do not document -# if _CCCL_COMPILER(NVHPC) -// suppress deprecation warnings for ConstantInputIterator. NVHPC makes them appear more widely than necessary +// suppress deprecation warnings for ConstantInputIterator _CCCL_SUPPRESS_DEPRECATED_PUSH -# endif // _CCCL_COMPILER(NVHPC) CUB_NAMESPACE_BEGIN namespace detail::reduce @@ -384,9 +382,7 @@ struct dispatch_streaming_arg_reduce_t }; } // namespace detail::reduce -# if _CCCL_COMPILER(NVHPC) _CCCL_SUPPRESS_DEPRECATED_POP -# endif // _CCCL_COMPILER(NVHPC) CUB_NAMESPACE_END #endif // !_CCCL_DOXYGEN_INVOKED From 5d4aa86ce043990a22db485b85015d88ecf5a540 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 24 Jan 2025 15:09:46 +0100 Subject: [PATCH 7/8] Fixes --- cub/cub/device/device_run_length_encode.cuh | 2 ++ cub/test/catch2_test_util_type.cu | 11 ++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cub/cub/device/device_run_length_encode.cuh b/cub/cub/device/device_run_length_encode.cuh index fe0b93dff74..72d134c8c04 100644 --- a/cub/cub/device/device_run_length_encode.cuh +++ b/cub/cub/device/device_run_length_encode.cuh @@ -210,6 +210,7 @@ struct DeviceRunLengthEncode using policy_t = detail::rle::encode::policy_hub; + _CCCL_SUPPRESS_DEPRECATED_PUSH return DispatchReduceByKey< InputIteratorT, UniqueOutputIteratorT, @@ -231,6 +232,7 @@ struct DeviceRunLengthEncode reduction_op(), num_items, stream); + _CCCL_SUPPRESS_DEPRECATED_POP } //! @rst diff --git a/cub/test/catch2_test_util_type.cu b/cub/test/catch2_test_util_type.cu index 608571e2d94..9e06b57edc9 100644 --- a/cub/test/catch2_test_util_type.cu +++ b/cub/test/catch2_test_util_type.cu @@ -25,20 +25,20 @@ * ******************************************************************************/ +#include +#include #include -#include -#include - #include #include C2H_TEST("Tests non_void_value_t", "[util][type]") { + _CCCL_SUPPRESS_DEPRECATED_PUSH using fallback_t = float; - using void_fancy_it = thrust::discard_iterator; - using non_void_fancy_it = thrust::counting_iterator; + using void_fancy_it = cub::DiscardOutputIterator; + using non_void_fancy_it = cub::CountingInputIterator; // falls back for const void* STATIC_REQUIRE(::cuda::std::is_same>::value); + _CCCL_SUPPRESS_DEPRECATED_POP } CUB_DEFINE_DETECT_NESTED_TYPE(cat_detect, cat); From e8499df009a9dbdac3a26d2a321b22c8bda2aac4 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Sun, 26 Jan 2025 20:42:20 +0100 Subject: [PATCH 8/8] MOre suppressino --- cub/test/catch2_test_device_reduce_fp_inf.cu | 3 +++ cub/test/catch2_test_device_reduce_large_offsets.cu | 3 +++ cub/test/catch2_test_device_run_length_encode.cu | 3 +++ 3 files changed, 9 insertions(+) diff --git a/cub/test/catch2_test_device_reduce_fp_inf.cu b/cub/test/catch2_test_device_reduce_fp_inf.cu index 101a30f8b65..17b49f1c651 100644 --- a/cub/test/catch2_test_device_reduce_fp_inf.cu +++ b/cub/test/catch2_test_device_reduce_fp_inf.cu @@ -45,6 +45,9 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::ArgMin, device_arg_min_old); DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::ArgMax, device_arg_max_old); _CCCL_SUPPRESS_DEPRECATED_POP +// suppress deprecation of ConstantInputIterator in cudafe1.stub.c file +_CCCL_SUPPRESS_DEPRECATED_PUSH + // %PARAM% TEST_LAUNCH lid 0:1 C2H_TEST("Device reduce arg{min,max} works with inf items", "[reduce][device]") diff --git a/cub/test/catch2_test_device_reduce_large_offsets.cu b/cub/test/catch2_test_device_reduce_large_offsets.cu index 2f9123d8658..18b7ba5d205 100644 --- a/cub/test/catch2_test_device_reduce_large_offsets.cu +++ b/cub/test/catch2_test_device_reduce_large_offsets.cu @@ -24,6 +24,9 @@ DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::ArgMin, device_arg_min); DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::Max, device_max); DECLARE_LAUNCH_WRAPPER(cub::DeviceReduce::ArgMax, device_arg_max); +// suppress deprecation of ConstantInputIterator in cudafe1.stub.c file +_CCCL_SUPPRESS_DEPRECATED_PUSH + // %PARAM% TEST_LAUNCH lid 0:1:2 // List of offset types to test diff --git a/cub/test/catch2_test_device_run_length_encode.cu b/cub/test/catch2_test_device_run_length_encode.cu index 1df22c77b17..b001f619958 100644 --- a/cub/test/catch2_test_device_run_length_encode.cu +++ b/cub/test/catch2_test_device_run_length_encode.cu @@ -50,6 +50,9 @@ _CCCL_SUPPRESS_DEPRECATED_PUSH DECLARE_LAUNCH_WRAPPER(cub::DeviceRunLengthEncode::Encode, run_length_encode); +// suppress deprecation of ConstantInputIterator in cudafe1.stub.c file +_CCCL_SUPPRESS_DEPRECATED_PUSH + // %PARAM% TEST_LAUNCH lid 0:1:2 using all_types =