Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improves DeviceSegmentedSort test run time for large number of items and segments #3246

Merged
merged 8 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 5 additions & 26 deletions cub/test/catch2_radix_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
#include <thrust/sequence.h>

#include <cuda/std/bit>
#include <cuda/std/functional>

#include <array>
#include <climits>
Expand All @@ -54,43 +55,21 @@
// Index types used for OffsetsT testing
using offset_types = c2h::type_list<cuda::std::int32_t, cuda::std::uint64_t>;
using all_offset_types =
c2h::type_list<cuda::std::int32_t, cuda::std::uint32_t, cuda::std::int64_t, cuda::std::uint64_t>;
c2h::type_list<cuda::std::int64_t, cuda::std::uint64_t, cuda::std::int32_t, cuda::std::uint32_t>;

// Create a segment iterator that returns the next multiple of Step except for a few cases. This allows to save memory
template <typename OffsetT, OffsetT Step>
struct segment_iterator
{
OffsetT last = 0;

segment_iterator(OffsetT last1)
segment_iterator(std::int64_t last1)
: last{last1}
{}

__host__ __device__ OffsetT operator()(OffsetT x) const
__host__ __device__ OffsetT operator()(std::int64_t x) const
{
switch (x)
{
case Step * 100:
return Step * 100 + Step / 2;
case Step * 200:
return Step * 200 + Step / 2;
case Step * 300:
return Step * 300 + Step / 2;
case Step * 400:
return Step * 400 + Step / 2;
case Step * 500:
return Step * 500 + Step / 2;
case Step * 600:
return Step * 600 + Step / 2;
case Step * 700:
return Step * 700 + Step / 2;
case Step * 800:
return Step * 800 + Step / 2;
case Step * 900:
return Step * 900 + Step / 2;
default:
return (x >= last) ? last : x * Step;
}
return ::cuda::std::min(last, x * Step);
elstehle marked this conversation as resolved.
Show resolved Hide resolved
}
};

Expand Down
192 changes: 191 additions & 1 deletion cub/test/catch2_segmented_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <thrust/scan.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

#include <cuda/std/limits>
#include <cuda/std/tuple>
Expand All @@ -46,11 +47,11 @@

#include <cstdio>

#include "catch2_test_launch_helper.h"
#include <c2h/catch2_test_helper.h>
#include <c2h/cpu_timer.h>
#include <c2h/extended_types.h>
#include <c2h/utility.h>
#include <catch2_test_launch_helper.h>
#include <nv/target>

#define MAKE_SEED_MOD_FUNCTION(name, xor_mask) \
Expand All @@ -71,6 +72,195 @@ MAKE_SEED_MOD_FUNCTION(offset_eraser, 0x3333333333333333)

#undef MAKE_SEED_MOD_FUNCTION

// Helper to generate a certain number of empty segments followed by equi-sized segments.
template <typename OffsetT, typename SegmentIndexT>
struct segment_index_to_offset_op
{
SegmentIndexT num_empty_segments;
SegmentIndexT num_segments;
OffsetT segment_size;
OffsetT num_items;

_CCCL_HOST_DEVICE __forceinline__ OffsetT operator()(SegmentIndexT i)
{
if (i < num_empty_segments)
{
return 0;
}
else if (i < num_segments)
{
return segment_size * static_cast<OffsetT>(i - num_empty_segments);
}
else
{
return num_items;
}
}
};

template <typename T>
struct mod_n
{
std::size_t mod;

template <typename IndexT>
_CCCL_HOST_DEVICE __forceinline__ T operator()(IndexT x)
{
return static_cast<T>(x % mod);
}
};

template <typename KeyT>
class short_key_verification_helper
{
private:
using key_t = KeyT;
// The histogram size of the keys being sorted for later verification
const std::int64_t max_histo_size = std::int64_t{1} << ::cuda::std::numeric_limits<key_t>::digits;

// Holding the histogram of the keys being sorted for verification
c2h::host_vector<std::size_t> keys_histogram{};

public:
void prepare_verification_data(const c2h::device_vector<key_t>& in_keys)
{
c2h::host_vector<key_t> h_in{in_keys};
keys_histogram = c2h::host_vector<std::size_t>(max_histo_size, 0);
for (const auto& key : h_in)
{
keys_histogram[key]++;
}
}

void verify_sorted(const c2h::device_vector<key_t>& out_keys) const
{
// Verify keys are sorted next to each other
auto count = thrust::unique_count(c2h::device_policy, out_keys.cbegin(), out_keys.cend(), thrust::equal_to<int>());
REQUIRE(count <= max_histo_size);

// Verify keys are sorted using prior histogram computation
auto index_it = thrust::make_counting_iterator(std::size_t{0});
c2h::device_vector<key_t> unique_keys_out(count);
c2h::device_vector<std::size_t> unique_indexes_out(count);
thrust::unique_by_key_copy(
c2h::device_policy,
out_keys.cbegin(),
out_keys.cend(),
index_it,
unique_keys_out.begin(),
unique_indexes_out.begin());

for (int i = 0; i < count; i++)
{
auto const next_end = (i == count - 1) ? out_keys.size() : unique_indexes_out[i + 1];
REQUIRE(keys_histogram[unique_keys_out[i]] == next_end - unique_indexes_out[i]);
}
}
};

template <typename KeyT>
class segmented_verification_helper
{
private:
using key_t = KeyT;
const std::size_t sequence_length{};

// Analytically computes the histogram for a segment of a series of keys: [0, 1, 2, ..., mod_n - 1, 0, 1, 2, ...].
// `segment_end` is one-past-the-end of the segment to compute the histogram for.
c2h::host_vector<int> compute_histogram_of_series(std::size_t segment_offset, std::size_t segment_end) const
{
// The i-th full cycle begins after segment_offset
const auto start_cycle = cuda::ceil_div(segment_offset, sequence_length);

// The last full cycle ending before segment_end
const auto end_cycle = segment_end / sequence_length;

// Number of full cycles repeating the sequence
const int full_cycles = (end_cycle > start_cycle) ? static_cast<int>(end_cycle - start_cycle) : 0;

// Add contributions from full cycles
c2h::host_vector<int> histogram(sequence_length, full_cycles);

// Partial cycles preceding the first full cycle
for (std::size_t j = segment_offset; j < start_cycle * sequence_length; ++j)
{
const auto value = j % sequence_length;
histogram[value]++;
}

// Partial cycles following the last full cycle
for (std::size_t j = end_cycle * sequence_length; j < segment_end; ++j)
{
const auto value = j % sequence_length;
histogram[value]++;
}
return histogram;
}

public:
segmented_verification_helper(int sequence_length)
: sequence_length(sequence_length)
{}

void prepare_input_data(c2h::device_vector<key_t>& in_keys) const
{
auto data_gen_it =
thrust::make_transform_iterator(thrust::make_counting_iterator(std::size_t{0}), mod_n<key_t>{sequence_length});
thrust::copy_n(data_gen_it, in_keys.size(), in_keys.begin());
}

template <typename SegmentOffsetItT>
void verify_sorted(c2h::device_vector<key_t>& out_keys, SegmentOffsetItT offsets, std::size_t num_segments) const
{
// The segments' end-offsets are provided by the segments' begin-offset iterator
auto offsets_plus_1 = offsets + 1;

// Verify keys are sorted next to each other
const auto count = static_cast<std::size_t>(
thrust::unique_count(c2h::device_policy, out_keys.cbegin(), out_keys.cend(), thrust::equal_to<int>()));
REQUIRE(count <= sequence_length * num_segments);

// // Verify keys are sorted using prior histogram computation
auto index_it = thrust::make_counting_iterator(std::size_t{0});
c2h::device_vector<key_t> unique_keys_out(count);
c2h::device_vector<std::size_t> unique_indexes_out(count);
thrust::unique_by_key_copy(
c2h::device_policy,
out_keys.cbegin(),
out_keys.cend(),
index_it,
unique_keys_out.begin(),
unique_indexes_out.begin());

// Copy the unique keys and indexes to host memory
c2h::host_vector<key_t> h_unique_keys_out{unique_keys_out};
c2h::host_vector<std::size_t> h_unique_indexes_out{unique_indexes_out};

// Verify keys are sorted using prior histogram computation
std::size_t uniques_index = 0;
std::size_t current_offset = 0;
for (std::size_t seg_index = 0; seg_index < num_segments; ++seg_index)
{
const auto segment_offset = offsets[seg_index];
const auto segment_end = offsets_plus_1[seg_index];
const auto segment_histogram = compute_histogram_of_series(segment_offset, segment_end);
for (std::size_t i = 0; i < sequence_length; i++)
{
if (segment_histogram[i] != 0)
{
CAPTURE(seg_index, i, uniques_index, current_offset, count);
auto const next_end =
(uniques_index == count - 1) ? out_keys.size() : h_unique_indexes_out[uniques_index + 1];
REQUIRE(h_unique_keys_out[uniques_index] == i);
REQUIRE(next_end - h_unique_indexes_out[uniques_index] == segment_histogram[i]);
current_offset += segment_histogram[i];
uniques_index++;
}
}
}
}
};

template <typename T>
struct unwrap_value_t_impl
{
Expand Down
Loading
Loading