diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9dabe4e8800..354560998c5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -336,6 +336,7 @@ add_library(
src/aggregation/result_cache.cpp
src/ast/expression_parser.cpp
src/ast/expressions.cpp
+ src/ast/operators.cpp
src/binaryop/binaryop.cpp
src/binaryop/compiled/ATan2.cu
src/binaryop/compiled/Add.cu
@@ -477,13 +478,13 @@ add_library(
src/io/avro/reader_impl.cu
src/io/comp/brotli_dict.cpp
src/io/comp/comp.cpp
+ src/io/comp/comp.cu
src/io/comp/cpu_unbz2.cpp
src/io/comp/debrotli.cu
src/io/comp/gpuinflate.cu
src/io/comp/nvcomp_adapter.cpp
src/io/comp/nvcomp_adapter.cu
src/io/comp/snap.cu
- src/io/comp/statistics.cu
src/io/comp/uncomp.cpp
src/io/comp/unsnap.cu
src/io/csv/csv_gpu.cu
@@ -515,6 +516,7 @@ add_library(
src/datetime/timezone.cpp
src/io/orc/writer_impl.cu
src/io/parquet/arrow_schema_writer.cpp
+ src/io/parquet/bloom_filter_reader.cu
src/io/parquet/compact_protocol_reader.cpp
src/io/parquet/compact_protocol_writer.cpp
src/io/parquet/decode_preprocess.cu
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index a1b7db5e08a..2b2a660bed7 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -601,7 +601,7 @@ std::unique_ptr make_udf_aggregation(udf_type type,
data_type output_type);
// Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation.
-struct host_udf_base;
+class host_udf_base;
/**
* @brief Factory to create a HOST_UDF aggregation.
diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp
index bbce76dc5f3..451d75137e4 100644
--- a/cpp/include/cudf/aggregation/host_udf.hpp
+++ b/cpp/include/cudf/aggregation/host_udf.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,18 +17,16 @@
#pragma once
#include
+#include
#include
#include
#include
-#include
#include
#include
+#include
#include
-#include
-#include
-#include
/**
* @file host_udf.hpp
@@ -43,49 +41,141 @@ namespace CUDF_EXPORT cudf {
*/
/**
- * @brief The interface for host-based UDF implementation.
+ * @brief The fundamental interface for host-based UDF implementation.
*
- * An implementation of host-based UDF needs to be derived from this base class, defining
- * its own version of the required functions. In particular:
- * - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`,
- * and `clone` functions.
- * - If necessary, the derived class can also override `do_hash` to compute hashing for its
- * instance, and `get_required_data` to selectively access to the input data as well as
- * intermediate data provided by libcudf.
+ * This class declares the functions `do_hash`, `is_equal`, and `clone` that must be defined in
+ * the users' UDF implementation. These functions are required for libcudf aggregation framework
+ * to perform its operations.
+ */
+class host_udf_base {
+ // Declare constructor private to prevent the users from deriving from this class.
+ private:
+ host_udf_base() = default; ///< Default constructor
+
+ // Only allow deriving from the structs below.
+ friend struct reduce_host_udf;
+ friend struct segmented_reduce_host_udf;
+ friend struct groupby_host_udf;
+
+ public:
+ virtual ~host_udf_base() = default; ///< Default destructor
+
+ /**
+ * @brief Computes hash value of the instance.
+ *
+ * Overriding this function is optional when the derived class has data members such that
+ * each instance needs to be differentiated from each other.
+ *
+ * @return The hash value of the instance
+ */
+ [[nodiscard]] virtual std::size_t do_hash() const
+ {
+ return std::hash{}(static_cast(aggregation::Kind::HOST_UDF));
+ }
+
+ /**
+ * @brief Compares two instances of the derived class for equality.
+ * @param other The other instance to compare with
+ * @return True if the two instances are equal
+ */
+ [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0;
+
+ /**
+ * @brief Clones the instance.
+ *
+ * The instances of the derived class should be lightweight for efficient cloning.
+ *
+ * @return A new instance cloned from this one
+ */
+ [[nodiscard]] virtual std::unique_ptr clone() const = 0;
+};
+
+/**
+ * @brief The interface for host-based UDF implementation for reduction contexts.
+ *
+ * An implementation of host-based UDF for reduction needs to be derived from this class.
+ * In addition to implementing the virtual functions declared in the base class `host_udf_base`,
+ * such derived classes must also define the `operator()` function to perform reduction
+ * operations.
*
- * Example of such implementation:
+ * Example:
* @code{.cpp}
- * struct my_udf_aggregation : cudf::host_udf_base {
+ * struct my_udf_aggregation : cudf::reduce_host_udf {
* my_udf_aggregation() = default;
*
- * // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`,
- * // and the result from groupby `MAX` aggregation.
- * [[nodiscard]] data_attribute_set_t get_required_data() const override
+ * [[nodiscard]] std::unique_ptr operator()(
+ * column_view const& input,
+ * data_type output_dtype,
+ * std::optional> init,
+ * rmm::cuda_stream_view stream,
+ * rmm::device_async_resource_ref mr) const override
* {
- * return {groupby_data_attribute::GROUPED_VALUES,
- * groupby_data_attribute::GROUP_OFFSETS,
- * cudf::make_max_aggregation()};
+ * // Perform reduction computation using the input data and return the reduction result.
+ * // This is where the actual reduction logic is implemented.
* }
*
- * [[nodiscard]] output_t get_empty_output(
- * [[maybe_unused]] std::optional output_dtype,
- * [[maybe_unused]] rmm::cuda_stream_view stream,
- * [[maybe_unused]] rmm::device_async_resource_ref mr) const override
+ * [[nodiscard]] bool is_equal(host_udf_base const& other) const override
* {
- * // This UDF aggregation always returns a column of type INT32.
- * return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
+ * // Check if the other object is also instance of this class.
+ * // If there are internal state variables, they may need to be checked for equality as well.
+ * return dynamic_cast(&other) != nullptr;
* }
*
- * [[nodiscard]] output_t operator()(input_map_t const& input,
- * rmm::cuda_stream_view stream,
- * rmm::device_async_resource_ref mr) const override
+ * [[nodiscard]] std::unique_ptr clone() const override
* {
- * // Perform UDF computation using the input data and return the result.
+ * return std::make_unique();
+ * }
+ * };
+ * @endcode
+ */
+struct reduce_host_udf : host_udf_base {
+ /**
+ * @brief Perform reduction operations.
+ *
+ * @param input The input column for reduction
+ * @param output_dtype The data type for the final output scalar
+ * @param init The initial value of the reduction
+ * @param stream The CUDA stream to use for any kernel launches
+ * @param mr Device memory resource to use for any allocations
+ * @return The output result of the aggregation
+ */
+ [[nodiscard]] virtual std::unique_ptr operator()(
+ column_view const& input,
+ data_type output_dtype,
+ std::optional> init,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr) const = 0;
+};
+
+/**
+ * @brief The interface for host-based UDF implementation for segmented reduction context.
+ *
+ * An implementation of host-based UDF for segmented reduction needs to be derived from this class.
+ * In addition to implementing the virtual functions declared in the base class `host_udf_base`,
+ * such derived class must also define the `operator()` function to perform segmented reduction.
+ *
+ * Example:
+ * @code{.cpp}
+ * struct my_udf_aggregation : cudf::segmented_reduce_host_udf {
+ * my_udf_aggregation() = default;
+ *
+ * [[nodiscard]] std::unique_ptr operator()(
+ * column_view const& input,
+ * device_span offsets,
+ * data_type output_dtype,
+ * null_policy null_handling,
+ * std::optional> init,
+ * rmm::cuda_stream_view stream,
+ * rmm::device_async_resource_ref mr) const override
+ * {
+ * // Perform computation using the input data and return the result.
+ * // This is where the actual segmented reduction logic is implemented.
* }
*
* [[nodiscard]] bool is_equal(host_udf_base const& other) const override
* {
* // Check if the other object is also instance of this class.
+ * // If there are internal state variables, they may need to be checked for equality as well.
* return dynamic_cast(&other) != nullptr;
* }
*
@@ -96,198 +186,232 @@ namespace CUDF_EXPORT cudf {
* };
* @endcode
*/
-struct host_udf_base {
- host_udf_base() = default;
- virtual ~host_udf_base() = default;
-
+struct segmented_reduce_host_udf : host_udf_base {
/**
- * @brief Define the possible data needed for groupby aggregations.
+ * @brief Perform segmented reduction operations.
*
- * Note that only sort-based groupby aggregations are supported.
+ * @param input The input column for reduction
+ * @param offsets A list of offsets defining the segments for reduction
+ * @param output_dtype The data type for the final output column
+ * @param null_handling If `INCLUDE` then the reduction result is valid only if all elements in
+ * the segment are valid, and if `EXCLUDE` then the reduction result is valid if any
+ * element in the segment is valid
+ * @param init The initial value of the reduction
+ * @param stream The CUDA stream to use for any kernel launches
+ * @param mr Device memory resource to use for any allocations
+ * @return The output result of the aggregation
*/
- enum class groupby_data_attribute : int32_t {
- INPUT_VALUES, ///< The input values column.
- GROUPED_VALUES, ///< The input values grouped according to the input `keys` for which the
- ///< values within each group maintain their original order.
- SORTED_GROUPED_VALUES, ///< The input values grouped according to the input `keys` and
- ///< sorted within each group.
- NUM_GROUPS, ///< The number of groups (i.e., number of distinct keys).
- GROUP_OFFSETS, ///< The offsets separating groups.
- GROUP_LABELS ///< Group labels (which is also the same as group indices).
- };
+ [[nodiscard]] virtual std::unique_ptr operator()(
+ column_view const& input,
+ device_span offsets,
+ data_type output_dtype,
+ null_policy null_handling,
+ std::optional> init,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr) const = 0;
+};
+// Forward declaration.
+namespace groupby ::detail {
+struct aggregate_result_functor;
+}
+
+/**
+ * @brief The interface for host-based UDF implementation for groupby aggregation context.
+ *
+ * An implementation of host-based UDF for groupby needs to be derived from this class.
+ * In addition to implementing the virtual functions declared in the base class `host_udf_base`,
+ * such a derived class must also define the functions `get_empty_output()` to return result when
+ * the input is empty, and ``operator()`` to perform its groupby operations.
+ *
+ * During execution, the derived class can access internal data provided by the libcudf groupby
+ * framework through a set of ``get*`` accessors, as well as calling other built-in groupby
+ * aggregations through the ``compute_aggregation`` function.
+ *
+ * @note The derived class can only perform sort-based groupby aggregations. Hash-based groupby
+ * aggregations require more complex data structure and is not yet supported.
+ *
+ * Example:
+ * @code{.cpp}
+ * struct my_udf_aggregation : cudf::groupby_host_udf {
+ * my_udf_aggregation() = default;
+ *
+ * [[nodiscard]] std::unique_ptr get_empty_output(
+ * rmm::cuda_stream_view stream,
+ * rmm::device_async_resource_ref mr) const override
+ * {
+ * // Return a column corresponding to the result when the input values column is empty.
+ * }
+ *
+ * [[nodiscard]] std::unique_ptr operator()(
+ * rmm::cuda_stream_view stream,
+ * rmm::device_async_resource_ref mr) const override
+ * {
+ * // Perform UDF computation using the input data and return the result.
+ * }
+ *
+ * [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+ * {
+ * // Check if the other object is also instance of this class.
+ * // If there are internal state variables, they may need to be checked for equality as well.
+ * return dynamic_cast(&other) != nullptr;
+ * }
+ *
+ * [[nodiscard]] std::unique_ptr clone() const override
+ * {
+ * return std::make_unique();
+ * }
+ * };
+ * @endcode
+ */
+struct groupby_host_udf : host_udf_base {
/**
- * @brief Describe possible data that may be needed in the derived class for its operations.
+ * @brief Get the output when the input values column is empty.
*
- * Such data can be either intermediate data such as sorted values or group labels etc, or the
- * results of other aggregations.
+ * This is called in libcudf when the input values column is empty. In such situations libcudf
+ * tries to generate the output directly without unnecessarily evaluating the intermediate data.
*
- * Each derived host-based UDF class may need a different set of data. It is inefficient to
- * evaluate and pass down all these possible data at once from libcudf. A solution for that is,
- * the derived class can define a subset of data that it needs and libcudf will evaluate
- * and pass down only data requested from that set.
+ * @param stream The CUDA stream to use for any kernel launches
+ * @param mr Device memory resource to use for any allocations
+ * @return The output result of the aggregation when the input values column is empty
*/
- struct data_attribute {
- /**
- * @brief Hold all possible data types for the input of the aggregation in the derived class.
- */
- using value_type = std::variant>;
- value_type value; ///< The actual data attribute, wrapped by this struct
- ///< as a wrapper is needed to define `hash` and `equal_to` functors.
-
- data_attribute() = default; ///< Default constructor
- data_attribute(data_attribute&&) = default; ///< Move constructor
-
- /**
- * @brief Construct a new data attribute from an aggregation attribute.
- * @param value_ An aggregation attribute
- */
- template )>
- data_attribute(T value_) : value{value_}
- {
- }
-
- /**
- * @brief Construct a new data attribute from another aggregation request.
- * @param value_ An aggregation request
- */
- template ||
- std::is_same_v)>
- data_attribute(std::unique_ptr value_) : value{std::move(value_)}
- {
- CUDF_EXPECTS(std::get>(value) != nullptr,
- "Invalid aggregation request.");
- if constexpr (std::is_same_v) {
- CUDF_EXPECTS(
- dynamic_cast(std::get>(value).get()) != nullptr,
- "Requesting results from other aggregations is only supported in groupby "
- "aggregations.");
- }
- }
-
- /**
- * @brief Copy constructor.
- * @param other The other data attribute to copy from
- */
- data_attribute(data_attribute const& other);
-
- /**
- * @brief Hash functor for `data_attribute`.
- */
- struct hash {
- /**
- * @brief Compute the hash value of a data attribute.
- * @param attr The data attribute to hash
- * @return The hash value of the data attribute
- */
- std::size_t operator()(data_attribute const& attr) const;
- }; // struct hash
-
- /**
- * @brief Equality comparison functor for `data_attribute`.
- */
- struct equal_to {
- /**
- * @brief Check if two data attributes are equal.
- * @param lhs The left-hand side data attribute
- * @param rhs The right-hand side data attribute
- * @return True if the two data attributes are equal
- */
- bool operator()(data_attribute const& lhs, data_attribute const& rhs) const;
- }; // struct equal_to
- }; // struct data_attribute
+ [[nodiscard]] virtual std::unique_ptr get_empty_output(
+ rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0;
/**
- * @brief Set of attributes for the input data that is needed for computing the aggregation.
+ * @brief Perform the main groupby computation for the host-based UDF.
+ *
+ * @param stream The CUDA stream to use for any kernel launches
+ * @param mr Device memory resource to use for any allocations
+ * @return The output result of the aggregation
*/
- using data_attribute_set_t =
- std::unordered_set;
+ [[nodiscard]] virtual std::unique_ptr operator()(
+ rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0;
+
+ private:
+ // Allow the struct `aggregate_result_functor` to set its private callback variables.
+ friend struct groupby::detail::aggregate_result_functor;
/**
- * @brief Return a set of attributes for the data that is needed for computing the aggregation.
- *
- * The derived class should return the attributes corresponding to only the data that it needs to
- * avoid unnecessary computation performed in libcudf. If this function is not overridden, an
- * empty set is returned. That means all the data attributes (except results from other
- * aggregations in groupby) will be needed.
- *
- * @return A set of `data_attribute`
+ * @brief Callback to access the input values column.
+ */
+ std::function callback_input_values;
+
+ /**
+ * @brief Callback to access the input values grouped according to the input keys for which the
+ * values within each group maintain their original order.
+ */
+ std::function callback_grouped_values;
+
+ /**
+ * @brief Callback to access the input values grouped according to the input keys and sorted
+ * within each group.
+ */
+ std::function callback_sorted_grouped_values;
+
+ /**
+ * @brief Callback to access the number of groups (i.e., number of distinct keys).
*/
- [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; }
+ std::function callback_num_groups;
/**
- * @brief Hold all possible types of the data that is passed to the derived class for executing
- * the aggregation.
+ * @brief Callback to access the offsets separating groups.
*/
- using input_data_t = std::variant>;
+ std::function(void)> callback_group_offsets;
/**
- * @brief Input to the aggregation, mapping from each data attribute to its actual data.
+ * @brief Callback to access the group labels (which is also the same as group indices).
*/
- using input_map_t = std::
- unordered_map;
+ std::function(void)> callback_group_labels;
/**
- * @brief Output type of the aggregation.
+ * @brief Callback to access the result from other groupby aggregations.
+ */
+ std::function)> callback_compute_aggregation;
+
+ protected:
+ /**
+ * @brief Access the input values column.
*
- * Currently only a single type is supported as the output of the aggregation, but it will hold
- * more type in the future when reduction is supported.
+ * @return The input values column.
*/
- using output_t = std::variant>;
+ [[nodiscard]] column_view get_input_values() const
+ {
+ CUDF_EXPECTS(callback_input_values, "Uninitialized callback_input_values.");
+ return callback_input_values();
+ }
/**
- * @brief Get the output when the input values column is empty.
+ * @brief Access the input values grouped according to the input keys for which the values
+ * within each group maintain their original order.
*
- * This is called in libcudf when the input values column is empty. In such situations libcudf
- * tries to generate the output directly without unnecessarily evaluating the intermediate data.
+ * @return The grouped values column.
+ */
+ [[nodiscard]] column_view get_grouped_values() const
+ {
+ CUDF_EXPECTS(callback_grouped_values, "Uninitialized callback_grouped_values.");
+ return callback_grouped_values();
+ }
+
+ /**
+ * @brief Access the input values grouped according to the input keys and sorted within each
+ * group.
*
- * @param output_dtype The expected output data type
- * @param stream The CUDA stream to use for any kernel launches
- * @param mr Device memory resource to use for any allocations
- * @return The output result of the aggregation when input values is empty
+ * @return The sorted grouped values column.
*/
- [[nodiscard]] virtual output_t get_empty_output(std::optional output_dtype,
- rmm::cuda_stream_view stream,
- rmm::device_async_resource_ref mr) const = 0;
+ [[nodiscard]] column_view get_sorted_grouped_values() const
+ {
+ CUDF_EXPECTS(callback_sorted_grouped_values, "Uninitialized callback_sorted_grouped_values.");
+ return callback_sorted_grouped_values();
+ }
/**
- * @brief Perform the main computation for the host-based UDF.
+ * @brief Access the number of groups (i.e., number of distinct keys).
*
- * @param input The input data needed for performing all computation
- * @param stream The CUDA stream to use for any kernel launches
- * @param mr Device memory resource to use for any allocations
- * @return The output result of the aggregation
+ * @return The number of groups.
*/
- [[nodiscard]] virtual output_t operator()(input_map_t const& input,
- rmm::cuda_stream_view stream,
- rmm::device_async_resource_ref mr) const = 0;
+ [[nodiscard]] size_type get_num_groups() const
+ {
+ CUDF_EXPECTS(callback_num_groups, "Uninitialized callback_num_groups.");
+ return callback_num_groups();
+ }
/**
- * @brief Computes hash value of the class's instance.
- * @return The hash value of the instance
+ * @brief Access the offsets separating groups.
+ *
+ * @return The array of group offsets.
*/
- [[nodiscard]] virtual std::size_t do_hash() const
+ [[nodiscard]] device_span get_group_offsets() const
{
- return std::hash{}(static_cast(aggregation::Kind::HOST_UDF));
+ CUDF_EXPECTS(callback_group_offsets, "Uninitialized callback_group_offsets.");
+ return callback_group_offsets();
}
/**
- * @brief Compares two instances of the derived class for equality.
- * @param other The other derived class's instance to compare with
- * @return True if the two instances are equal
+ * @brief Access the group labels (which is also the same as group indices).
+ *
+ * @return The array of group labels.
*/
- [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0;
+ [[nodiscard]] device_span get_group_labels() const
+ {
+ CUDF_EXPECTS(callback_group_labels, "Uninitialized callback_group_labels.");
+ return callback_group_labels();
+ }
/**
- * @brief Clones the instance.
+ * @brief Compute a built-in groupby aggregation and access its result.
*
- * A class derived from `host_udf_base` should not store too much data such that its instances
- * remain lightweight for efficient cloning.
+ * This allows the derived class to call any other built-in groupby aggregations on the same input
+ * values column and access the output for its operations.
*
- * @return A new instance cloned from this
+ * @param other_agg An arbitrary built-in groupby aggregation
+ * @return A `column_view` object corresponding to the output result of the given aggregation
*/
- [[nodiscard]] virtual std::unique_ptr clone() const = 0;
+ [[nodiscard]] column_view compute_aggregation(std::unique_ptr other_agg) const
+ {
+ CUDF_EXPECTS(callback_compute_aggregation, "Uninitialized callback for computing aggregation.");
+ return callback_compute_aggregation(std::move(other_agg));
+ }
};
/** @} */ // end of group
diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
index 9d8762555d7..001b604814c 100644
--- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh
+++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -452,7 +452,7 @@ struct expression_evaluator {
++operator_index) {
// Execute operator
auto const op = plan.operators[operator_index];
- auto const arity = ast_operator_arity(op);
+ auto const arity = plan.operator_arities[operator_index];
if (arity == 1) {
// Unary operator
auto const& input =
diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index b5973d0ace9..d2e8c1cd41f 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
#include
#include
+#include
#include
#include
#include
@@ -88,6 +89,7 @@ struct expression_device_view {
device_span data_references;
device_span literals;
device_span operators;
+ device_span operator_arities;
device_span operator_source_indices;
cudf::size_type num_intermediates;
};
@@ -229,39 +231,55 @@ class expression_parser {
* @param[in] v The `std::vector` containing components (operators, literals, etc).
* @param[in,out] sizes The `std::vector` containing the size of each data buffer.
* @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer.
+ * @param[in,out] alignment The maximum alignment needed for all the extracted size and pointers
*/
template
void extract_size_and_pointer(std::vector const& v,
std::vector& sizes,
- std::vector& data_pointers)
+ std::vector& data_pointers,
+ cudf::size_type& alignment)
{
+ // sub-type alignment will only work provided the alignment is lesser or equal to
+ // alignof(max_align_t) which is the maximum alignment provided by rmm's device buffers
+ static_assert(alignof(T) <= alignof(max_align_t));
auto const data_size = sizeof(T) * v.size();
sizes.push_back(data_size);
data_pointers.push_back(v.data());
+ alignment = std::max(alignment, static_cast(alignof(T)));
}
void move_to_device(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
{
std::vector sizes;
std::vector data_pointers;
+ // use a minimum of 4-byte alignment
+ cudf::size_type buffer_alignment = 4;
- extract_size_and_pointer(_data_references, sizes, data_pointers);
- extract_size_and_pointer(_literals, sizes, data_pointers);
- extract_size_and_pointer(_operators, sizes, data_pointers);
- extract_size_and_pointer(_operator_source_indices, sizes, data_pointers);
+ extract_size_and_pointer(_data_references, sizes, data_pointers, buffer_alignment);
+ extract_size_and_pointer(_literals, sizes, data_pointers, buffer_alignment);
+ extract_size_and_pointer(_operators, sizes, data_pointers, buffer_alignment);
+ extract_size_and_pointer(_operator_arities, sizes, data_pointers, buffer_alignment);
+ extract_size_and_pointer(_operator_source_indices, sizes, data_pointers, buffer_alignment);
// Create device buffer
- auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0);
- auto buffer_offsets = std::vector(sizes.size());
- thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0);
+ auto buffer_offsets = std::vector(sizes.size());
+ thrust::exclusive_scan(sizes.cbegin(),
+ sizes.cend(),
+ buffer_offsets.begin(),
+ cudf::size_type{0},
+ [buffer_alignment](auto a, auto b) {
+ // align each component of the AST program
+ return cudf::util::round_up_safe(a + b, buffer_alignment);
+ });
+
+ auto const buffer_size = buffer_offsets.empty() ? 0 : (buffer_offsets.back() + sizes.back());
+ auto host_data_buffer = std::vector(buffer_size);
- auto h_data_buffer = std::vector(buffer_size);
for (unsigned int i = 0; i < data_pointers.size(); ++i) {
- std::memcpy(h_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]);
+ std::memcpy(host_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]);
}
- _device_data_buffer = rmm::device_buffer(h_data_buffer.data(), buffer_size, stream, mr);
-
+ _device_data_buffer = rmm::device_buffer(host_data_buffer.data(), buffer_size, stream, mr);
stream.synchronize();
// Create device pointers to components of plan
@@ -277,8 +295,11 @@ class expression_parser {
device_expression_data.operators = device_span(
reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]),
_operators.size());
- device_expression_data.operator_source_indices = device_span(
+ device_expression_data.operator_arities = device_span(
reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]),
+ _operators.size());
+ device_expression_data.operator_source_indices = device_span(
+ reinterpret_cast(device_data_buffer_ptr + buffer_offsets[4]),
_operator_source_indices.size());
device_expression_data.num_intermediates = _intermediate_counter.get_max_used();
shmem_per_thread = static_cast(
@@ -322,6 +343,7 @@ class expression_parser {
bool _has_nulls;
std::vector _data_references;
std::vector _operators;
+ std::vector _operator_arities;
std::vector _operator_source_indices;
std::vector _literals;
};
diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index 46507700e21..db04e1fe989 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -69,159 +69,111 @@ constexpr bool is_valid_unary_op = cuda::std::is_invocable_v;
* @param args Forwarded arguments to `operator()` of `f`.
*/
template
-CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, F&& f, Ts&&... args)
+CUDF_HOST_DEVICE inline constexpr decltype(auto) ast_operator_dispatcher(ast_operator op,
+ F&& f,
+ Ts&&... args)
{
switch (op) {
case ast_operator::ADD:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::SUB:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::MUL:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::DIV:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::TRUE_DIV:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::FLOOR_DIV:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::MOD:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::PYMOD:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::POW:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::EQUAL:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::NULL_EQUAL:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::NOT_EQUAL:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::LESS:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::GREATER:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::LESS_EQUAL:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::GREATER_EQUAL:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::BITWISE_AND:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::BITWISE_OR:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::BITWISE_XOR:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::LOGICAL_AND:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::NULL_LOGICAL_AND:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::LOGICAL_OR:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::NULL_LOGICAL_OR:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::IDENTITY:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::IS_NULL:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::SIN:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::COS:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::TAN:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::ARCSIN:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::ARCCOS:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::ARCTAN:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::SINH:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::COSH:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::TANH:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::ARCSINH:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::ARCCOSH:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::ARCTANH:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()(std::forward(args)...);
case ast_operator::EXP:
- f.template operator()(std::forward(args)...);
- break;
+ return f.template operator()