Skip to content

Commit

Permalink
Add seed parameter to hash_character_ngrams (#17643)
Browse files Browse the repository at this point in the history
Adds a seed parameter to the `nvtext::hash_character_ngrams` API.
This makes the function more useful in conjunction with other nvtext APIs.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: #17643
  • Loading branch information
davidwendt authored Jan 13, 2025
1 parent dc2a75c commit fdd4255
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 48 deletions.
39 changes: 0 additions & 39 deletions cpp/include/nvtext/detail/generate_ngrams.hpp

This file was deleted.

4 changes: 3 additions & 1 deletion cpp/include/nvtext/generate_ngrams.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -117,13 +117,15 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
*
* @param input Strings column to produce ngrams from
* @param ngrams The number of characters in each ngram. Default is 5.
* @param seed The seed value to use with the hash algorithm. Default is 0.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A lists column of hash values
*/
std::unique_ptr<cudf::column> hash_character_ngrams(
cudf::strings_column_view const& input,
cudf::size_type ngrams = 5,
// `seed` sits ahead of `stream` and `mr`: a caller that passes `stream` positionally
// must now pass a seed value as well.
uint32_t seed = 0,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

Expand Down
13 changes: 8 additions & 5 deletions cpp/src/text/generate_ngrams.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -32,7 +32,7 @@
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <nvtext/detail/generate_ngrams.hpp>
#include <nvtext/generate_ngrams.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
Expand Down Expand Up @@ -315,6 +315,7 @@ namespace {
*/
CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings,
cudf::size_type ngrams,
uint32_t seed,
cudf::size_type const* d_ngram_offsets,
cudf::hash_value_type* d_results)
{
Expand All @@ -332,7 +333,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st
__shared__ cudf::hash_value_type hvs[block_size]; // temp store for hash values

auto const ngram_offset = d_ngram_offsets[str_idx];
auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{seed};

auto const end = d_str.data() + d_str.size_bytes();
auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1;
Expand Down Expand Up @@ -368,6 +369,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st

std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& input,
cudf::size_type ngrams,
uint32_t seed,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
Expand Down Expand Up @@ -400,7 +402,7 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
auto d_hashes = hashes->mutable_view().data<cudf::hash_value_type>();

character_ngram_hash_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*d_strings, ngrams, d_offsets, d_hashes);
*d_strings, ngrams, seed, d_offsets, d_hashes);

return make_lists_column(
input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr);
Expand All @@ -419,11 +421,12 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie

/**
 * Public entry point for hashing character ngrams.
 *
 * Forwards every argument, including the hash seed, to
 * detail::hash_character_ngrams and returns its result unchanged.
 *
 * @param strings Strings column to produce ngrams from
 * @param ngrams  The ngram value passed through to the detail implementation
 * @param seed    Seed value passed through to the detail implementation
 * @param stream  CUDA stream passed through to the detail implementation
 * @param mr      Device memory resource passed through to the detail implementation
 * @return The column produced by detail::hash_character_ngrams
 */
std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
                                                    cudf::size_type ngrams,
                                                    uint32_t seed,
                                                    rmm::cuda_stream_view stream,
                                                    rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  // Single forwarding call: the seed must reach the detail layer, so the older
  // four-argument call (which dropped it) is not kept.
  return detail::hash_character_ngrams(strings, ngrams, seed, stream, mr);
}

} // namespace nvtext
4 changes: 2 additions & 2 deletions cpp/tests/streams/text/ngrams_test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -45,7 +45,7 @@ TEST_F(TextNGramsTest, HashCharacterNgrams)
auto input =
cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
nvtext::hash_character_ngrams(
cudf::strings_column_view(input), 5, cudf::test::get_default_stream());
cudf::strings_column_view(input), 5, 5, cudf::test::get_default_stream());
}

TEST_F(TextNGramsTest, NgramsTokenize)
Expand Down
13 changes: 12 additions & 1 deletion cpp/tests/text/ngrams_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -159,6 +159,17 @@ TEST_F(TextGenerateNgramsTest, NgramsHash)
2319357747u}});
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

results = nvtext::hash_character_ngrams(view, 10, 10);
// clang-format off
LCW expected2({LCW{2818025299u, 4026424618u, 578054337u, 2107870805u, 3942221995u,
2802685757u, 2686450821u, 584898501u, 2206824201u, 487979059u},
LCW{1154048732u, 3209682333u, 3246563372u, 3789750511u, 1287153502u,
3759561568u, 1092423314u, 339538635u, 4265577390u, 879551618u,
4222824617u, 1774528854u, 1028254379u, 485918316u, 879142987u, 3619248543u}
});
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
}

TEST_F(TextGenerateNgramsTest, NgramsHashErrors)
Expand Down

0 comments on commit fdd4255

Please sign in to comment.