From 4ec389b4c515e4b3b85d6fd28b1471b1f1de830d Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 13 Jan 2025 09:27:13 -0800
Subject: [PATCH 1/4] Implement `HOST_UDF` aggregation for reduction and
 segmented reduction (#17645)

Following https://github.com/rapidsai/cudf/pull/17592, this enables `HOST_UDF`
aggregation in reduction and segmented reduction, allowing a host-side
user-defined function (UDF) to be executed through the libcudf aggregation
framework.

Closes https://github.com/rapidsai/cudf/issues/16633.

Authors:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Yunsong Wang (https://github.com/PointKernel)
  - https://github.com/nvdbaranec

URL: https://github.com/rapidsai/cudf/pull/17645
---
 cpp/include/cudf/aggregation.hpp              |   4 +-
 cpp/include/cudf/aggregation/host_udf.hpp     | 478 +++++++++++-------
 .../cudf/detail/aggregation/aggregation.hpp   |   6 +-
 cpp/src/groupby/groupby.cu                    |   9 +-
 cpp/src/groupby/sort/aggregate.cpp            |  81 ++-
 cpp/src/groupby/sort/host_udf_aggregation.cpp |  48 +-
 cpp/src/reductions/reductions.cpp             |  16 +-
 cpp/src/reductions/segmented/reductions.cpp   |  17 +-
 cpp/tests/CMakeLists.txt                      |   3 +-
 cpp/tests/groupby/host_udf_example_tests.cu   |  75 +--
 cpp/tests/groupby/host_udf_tests.cpp          | 245 ++++-----
 .../reductions/host_udf_example_tests.cu      | 422 ++++++++++++++++
 .../main/java/ai/rapids/cudf/Aggregation.java |   2 +-
 .../ai/rapids/cudf/GroupByAggregation.java    |   2 +-
 .../ai/rapids/cudf/ReductionAggregation.java  |  17 +-
 .../cudf/SegmentedReductionAggregation.java   |  11 +-
 16 files changed, 941 insertions(+), 495 deletions(-)
 create mode 100644 cpp/tests/reductions/host_udf_example_tests.cu

diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index a1b7db5e08a..2b2a660bed7 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -601,7 +601,7 @@ std::unique_ptr<aggregation> make_udf_aggregation(udf_type type,
                                                   data_type output_type);

 // Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation.
-struct host_udf_base;
+class host_udf_base;

 /**
 * @brief Factory to create a HOST_UDF aggregation.
diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp
index bbce76dc5f3..451d75137e4 100644
--- a/cpp/include/cudf/aggregation/host_udf.hpp
+++ b/cpp/include/cudf/aggregation/host_udf.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -17,18 +17,16 @@
 #pragma once

 #include
+#include
 #include
 #include
 #include
-#include
 #include
 #include
+#include
 #include
-#include
-#include
-#include

/**
 * @file host_udf.hpp
 * @brief Declare the base class for host-side user-defined function (`HOST_UDF`) and example of
 * subclass implementation.
 */

namespace CUDF_EXPORT cudf {
/**
 * @addtogroup aggregation_factories
 * @{
 */

/**
- * @brief The interface for host-based UDF implementation.
+ * @brief The fundamental interface for host-based UDF implementation.
 *
- * An implementation of host-based UDF needs to be derived from this base class, defining
- * its own version of the required functions. In particular:
- * - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`,
- *   and `clone` functions.
- * - If necessary, the derived class can also override `do_hash` to compute hashing for its
- *   instance, and `get_required_data` to selectively access to the input data as well as
- *   intermediate data provided by libcudf.
+ * This class declares the functions `do_hash`, `is_equal`, and `clone` that must be defined in
+ * the user's UDF implementation. These functions are required by the libcudf aggregation
+ * framework to perform its operations.
+ */
+class host_udf_base {
+  // Declare the constructor private to prevent users from deriving from this class.
+ private:
+  host_udf_base() = default;  ///< Default constructor
+
+  // Only allow deriving from the structs below.
+  friend struct reduce_host_udf;
+  friend struct segmented_reduce_host_udf;
+  friend struct groupby_host_udf;
+
+ public:
+  virtual ~host_udf_base() = default;  ///< Default destructor
+
+  /**
+   * @brief Computes hash value of the instance.
+   *
+   * Overriding this function is optional; it is needed when the derived class has data members
+   * that distinguish its instances from one another.
+   *
+   * @return The hash value of the instance
+   */
+  [[nodiscard]] virtual std::size_t do_hash() const
+  {
+    return std::hash<int>{}(static_cast<int>(aggregation::Kind::HOST_UDF));
+  }
+
+  /**
+   * @brief Compares two instances of the derived class for equality.
+   * @param other The other instance to compare with
+   * @return True if the two instances are equal
+   */
+  [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0;
+
+  /**
+   * @brief Clones the instance.
+   *
+   * The instances of the derived class should be lightweight for efficient cloning.
+   *
+   * @return A new instance cloned from this one
+   */
+  [[nodiscard]] virtual std::unique_ptr<host_udf_base> clone() const = 0;
+};
+
+/**
+ * @brief The interface for host-based UDF implementation for reduction contexts.
+ *
+ * An implementation of host-based UDF for reduction needs to be derived from this class.
+ * In addition to implementing the virtual functions declared in the base class `host_udf_base`,
+ * such derived classes must also define the `operator()` function to perform reduction
+ * operations.
 *
- * Example of such implementation:
+ * Example:
 * @code{.cpp}
- * struct my_udf_aggregation : cudf::host_udf_base {
+ * struct my_udf_aggregation : cudf::reduce_host_udf {
 *   my_udf_aggregation() = default;
 *
- *   // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`,
- *   // and the result from groupby `MAX` aggregation.
- *   [[nodiscard]] data_attribute_set_t get_required_data() const override
+ *   [[nodiscard]] std::unique_ptr<scalar> operator()(
+ *     column_view const& input,
+ *     data_type output_dtype,
+ *     std::optional<std::reference_wrapper<scalar const>> init,
+ *     rmm::cuda_stream_view stream,
+ *     rmm::device_async_resource_ref mr) const override
 *   {
- *     return {groupby_data_attribute::GROUPED_VALUES,
- *             groupby_data_attribute::GROUP_OFFSETS,
- *             cudf::make_max_aggregation<cudf::groupby_aggregation>()};
+ *     // Perform reduction computation using the input data and return the reduction result.
+ *     // This is where the actual reduction logic is implemented.
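+ *     // (As a minimal sketch, assuming numeric input: run a device reduction such as
+ *     // thrust::transform_reduce over `input` and wrap the result in a cudf scalar;
+ *     // see the reduction example test added later in this patch.)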
* } * - * [[nodiscard]] output_t get_empty_output( - * [[maybe_unused]] std::optional output_dtype, - * [[maybe_unused]] rmm::cuda_stream_view stream, - * [[maybe_unused]] rmm::device_async_resource_ref mr) const override + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override * { - * // This UDF aggregation always returns a column of type INT32. - * return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. + * return dynamic_cast(&other) != nullptr; * } * - * [[nodiscard]] output_t operator()(input_map_t const& input, - * rmm::cuda_stream_view stream, - * rmm::device_async_resource_ref mr) const override + * [[nodiscard]] std::unique_ptr clone() const override * { - * // Perform UDF computation using the input data and return the result. + * return std::make_unique(); + * } + * }; + * @endcode + */ +struct reduce_host_udf : host_udf_base { + /** + * @brief Perform reduction operations. + * + * @param input The input column for reduction + * @param output_dtype The data type for the final output scalar + * @param init The initial value of the reduction + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation + */ + [[nodiscard]] virtual std::unique_ptr operator()( + column_view const& input, + data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; +}; + +/** + * @brief The interface for host-based UDF implementation for segmented reduction context. + * + * An implementation of host-based UDF for segmented reduction needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such derived class must also define the `operator()` function to perform segmented reduction. + * + * Example: + * @code{.cpp} + * struct my_udf_aggregation : cudf::segmented_reduce_host_udf { + * my_udf_aggregation() = default; + * + * [[nodiscard]] std::unique_ptr operator()( + * column_view const& input, + * device_span offsets, + * data_type output_dtype, + * null_policy null_handling, + * std::optional> init, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform computation using the input data and return the result. + * // This is where the actual segmented reduction logic is implemented. * } * * [[nodiscard]] bool is_equal(host_udf_base const& other) const override * { * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. * return dynamic_cast(&other) != nullptr; * } * @@ -96,198 +186,232 @@ namespace CUDF_EXPORT cudf { * }; * @endcode */ -struct host_udf_base { - host_udf_base() = default; - virtual ~host_udf_base() = default; - +struct segmented_reduce_host_udf : host_udf_base { /** - * @brief Define the possible data needed for groupby aggregations. + * @brief Perform segmented reduction operations. * - * Note that only sort-based groupby aggregations are supported. 
+   * @param input The input column for reduction
+   * @param offsets A list of offsets defining the segments for reduction
+   * @param output_dtype The data type for the final output column
+   * @param null_handling If `INCLUDE` then the reduction result is valid only if all elements in
+   *        the segment are valid, and if `EXCLUDE` then the reduction result is valid if any
+   *        element in the segment is valid
+   * @param init The initial value of the reduction
+   * @param stream The CUDA stream to use for any kernel launches
+   * @param mr Device memory resource to use for any allocations
+   * @return The output result of the aggregation
   */
-  enum class groupby_data_attribute : int32_t {
-    INPUT_VALUES,           ///< The input values column.
-    GROUPED_VALUES,         ///< The input values grouped according to the input `keys` for which the
-                            ///< values within each group maintain their original order.
-    SORTED_GROUPED_VALUES,  ///< The input values grouped according to the input `keys` and
-                            ///< sorted within each group.
-    NUM_GROUPS,             ///< The number of groups (i.e., number of distinct keys).
-    GROUP_OFFSETS,          ///< The offsets separating groups.
-    GROUP_LABELS            ///< Group labels (which is also the same as group indices).
-  };
+  [[nodiscard]] virtual std::unique_ptr<column> operator()(
+    column_view const& input,
+    device_span<size_type const> offsets,
+    data_type output_dtype,
+    null_policy null_handling,
+    std::optional<std::reference_wrapper<scalar const>> init,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) const = 0;
+};
+
+// Forward declaration.
+namespace groupby::detail {
+struct aggregate_result_functor;
+}
+
+/**
+ * @brief The interface for host-based UDF implementation for groupby aggregation contexts.
+ *
+ * An implementation of host-based UDF for groupby needs to be derived from this class.
+ * In addition to implementing the virtual functions declared in the base class `host_udf_base`,
+ * such a derived class must also define the function `get_empty_output()`, which returns the
+ * result when the input is empty, and `operator()`, which performs its groupby operations.
+ *
+ * During execution, the derived class can access internal data provided by the libcudf groupby
+ * framework through a set of `get*` accessors, as well as calling other built-in groupby
+ * aggregations through the `compute_aggregation` function.
+ *
+ * @note The derived class can only perform sort-based groupby aggregations. Hash-based groupby
+ * aggregations require a more complex data structure and are not yet supported.
+ *
+ * Example:
+ * @code{.cpp}
+ * struct my_udf_aggregation : cudf::groupby_host_udf {
+ *   my_udf_aggregation() = default;
+ *
+ *   [[nodiscard]] std::unique_ptr<column> get_empty_output(
+ *     rmm::cuda_stream_view stream,
+ *     rmm::device_async_resource_ref mr) const override
+ *   {
+ *     // Return a column corresponding to the result when the input values column is empty.
+ *   }
+ *
+ *   [[nodiscard]] std::unique_ptr<column> operator()(
+ *     rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override
+ *   {
+ *     // Perform UDF computation using the input data and return the result.
+ *   }
+ *
+ *   [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+ *   {
+ *     // Check if the other object is also an instance of this class.
+ *     // If there are internal state variables, they may need to be checked for equality as well.
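+ *     // (This example class is stateless, so checking the dynamic type is sufficient.)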
+ * return dynamic_cast(&other) != nullptr; + * } + * + * [[nodiscard]] std::unique_ptr clone() const override + * { + * return std::make_unique(); + * } + * }; + * @endcode + */ +struct groupby_host_udf : host_udf_base { /** - * @brief Describe possible data that may be needed in the derived class for its operations. + * @brief Get the output when the input values column is empty. * - * Such data can be either intermediate data such as sorted values or group labels etc, or the - * results of other aggregations. + * This is called in libcudf when the input values column is empty. In such situations libcudf + * tries to generate the output directly without unnecessarily evaluating the intermediate data. * - * Each derived host-based UDF class may need a different set of data. It is inefficient to - * evaluate and pass down all these possible data at once from libcudf. A solution for that is, - * the derived class can define a subset of data that it needs and libcudf will evaluate - * and pass down only data requested from that set. + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation when the input values column is empty */ - struct data_attribute { - /** - * @brief Hold all possible data types for the input of the aggregation in the derived class. - */ - using value_type = std::variant>; - value_type value; ///< The actual data attribute, wrapped by this struct - ///< as a wrapper is needed to define `hash` and `equal_to` functors. - - data_attribute() = default; ///< Default constructor - data_attribute(data_attribute&&) = default; ///< Move constructor - - /** - * @brief Construct a new data attribute from an aggregation attribute. - * @param value_ An aggregation attribute - */ - template )> - data_attribute(T value_) : value{value_} - { - } - - /** - * @brief Construct a new data attribute from another aggregation request. - * @param value_ An aggregation request - */ - template || - std::is_same_v)> - data_attribute(std::unique_ptr value_) : value{std::move(value_)} - { - CUDF_EXPECTS(std::get>(value) != nullptr, - "Invalid aggregation request."); - if constexpr (std::is_same_v) { - CUDF_EXPECTS( - dynamic_cast(std::get>(value).get()) != nullptr, - "Requesting results from other aggregations is only supported in groupby " - "aggregations."); - } - } - - /** - * @brief Copy constructor. - * @param other The other data attribute to copy from - */ - data_attribute(data_attribute const& other); - - /** - * @brief Hash functor for `data_attribute`. - */ - struct hash { - /** - * @brief Compute the hash value of a data attribute. - * @param attr The data attribute to hash - * @return The hash value of the data attribute - */ - std::size_t operator()(data_attribute const& attr) const; - }; // struct hash - - /** - * @brief Equality comparison functor for `data_attribute`. - */ - struct equal_to { - /** - * @brief Check if two data attributes are equal. - * @param lhs The left-hand side data attribute - * @param rhs The right-hand side data attribute - * @return True if the two data attributes are equal - */ - bool operator()(data_attribute const& lhs, data_attribute const& rhs) const; - }; // struct equal_to - }; // struct data_attribute + [[nodiscard]] virtual std::unique_ptr get_empty_output( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0; /** - * @brief Set of attributes for the input data that is needed for computing the aggregation. 
+ * @brief Perform the main groupby computation for the host-based UDF. + * + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation */ - using data_attribute_set_t = - std::unordered_set; + [[nodiscard]] virtual std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0; + + private: + // Allow the struct `aggregate_result_functor` to set its private callback variables. + friend struct groupby::detail::aggregate_result_functor; /** - * @brief Return a set of attributes for the data that is needed for computing the aggregation. - * - * The derived class should return the attributes corresponding to only the data that it needs to - * avoid unnecessary computation performed in libcudf. If this function is not overridden, an - * empty set is returned. That means all the data attributes (except results from other - * aggregations in groupby) will be needed. - * - * @return A set of `data_attribute` + * @brief Callback to access the input values column. + */ + std::function callback_input_values; + + /** + * @brief Callback to access the input values grouped according to the input keys for which the + * values within each group maintain their original order. + */ + std::function callback_grouped_values; + + /** + * @brief Callback to access the input values grouped according to the input keys and sorted + * within each group. + */ + std::function callback_sorted_grouped_values; + + /** + * @brief Callback to access the number of groups (i.e., number of distinct keys). */ - [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; } + std::function callback_num_groups; /** - * @brief Hold all possible types of the data that is passed to the derived class for executing - * the aggregation. + * @brief Callback to access the offsets separating groups. */ - using input_data_t = std::variant>; + std::function(void)> callback_group_offsets; /** - * @brief Input to the aggregation, mapping from each data attribute to its actual data. + * @brief Callback to access the group labels (which is also the same as group indices). */ - using input_map_t = std:: - unordered_map; + std::function(void)> callback_group_labels; /** - * @brief Output type of the aggregation. + * @brief Callback to access the result from other groupby aggregations. + */ + std::function)> callback_compute_aggregation; + + protected: + /** + * @brief Access the input values column. * - * Currently only a single type is supported as the output of the aggregation, but it will hold - * more type in the future when reduction is supported. + * @return The input values column. */ - using output_t = std::variant>; + [[nodiscard]] column_view get_input_values() const + { + CUDF_EXPECTS(callback_input_values, "Uninitialized callback_input_values."); + return callback_input_values(); + } /** - * @brief Get the output when the input values column is empty. + * @brief Access the input values grouped according to the input keys for which the values + * within each group maintain their original order. * - * This is called in libcudf when the input values column is empty. In such situations libcudf - * tries to generate the output directly without unnecessarily evaluating the intermediate data. + * @return The grouped values column. 
+ */ + [[nodiscard]] column_view get_grouped_values() const + { + CUDF_EXPECTS(callback_grouped_values, "Uninitialized callback_grouped_values."); + return callback_grouped_values(); + } + + /** + * @brief Access the input values grouped according to the input keys and sorted within each + * group. * - * @param output_dtype The expected output data type - * @param stream The CUDA stream to use for any kernel launches - * @param mr Device memory resource to use for any allocations - * @return The output result of the aggregation when input values is empty + * @return The sorted grouped values column. */ - [[nodiscard]] virtual output_t get_empty_output(std::optional output_dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; + [[nodiscard]] column_view get_sorted_grouped_values() const + { + CUDF_EXPECTS(callback_sorted_grouped_values, "Uninitialized callback_sorted_grouped_values."); + return callback_sorted_grouped_values(); + } /** - * @brief Perform the main computation for the host-based UDF. + * @brief Access the number of groups (i.e., number of distinct keys). * - * @param input The input data needed for performing all computation - * @param stream The CUDA stream to use for any kernel launches - * @param mr Device memory resource to use for any allocations - * @return The output result of the aggregation + * @return The number of groups. */ - [[nodiscard]] virtual output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; + [[nodiscard]] size_type get_num_groups() const + { + CUDF_EXPECTS(callback_num_groups, "Uninitialized callback_num_groups."); + return callback_num_groups(); + } /** - * @brief Computes hash value of the class's instance. - * @return The hash value of the instance + * @brief Access the offsets separating groups. + * + * @return The array of group offsets. */ - [[nodiscard]] virtual std::size_t do_hash() const + [[nodiscard]] device_span get_group_offsets() const { - return std::hash{}(static_cast(aggregation::Kind::HOST_UDF)); + CUDF_EXPECTS(callback_group_offsets, "Uninitialized callback_group_offsets."); + return callback_group_offsets(); } /** - * @brief Compares two instances of the derived class for equality. - * @param other The other derived class's instance to compare with - * @return True if the two instances are equal + * @brief Access the group labels (which is also the same as group indices). + * + * @return The array of group labels. */ - [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + [[nodiscard]] device_span get_group_labels() const + { + CUDF_EXPECTS(callback_group_labels, "Uninitialized callback_group_labels."); + return callback_group_labels(); + } /** - * @brief Clones the instance. + * @brief Compute a built-in groupby aggregation and access its result. * - * A class derived from `host_udf_base` should not store too much data such that its instances - * remain lightweight for efficient cloning. + * This allows the derived class to call any other built-in groupby aggregations on the same input + * values column and access the output for its operations. 
* - * @return A new instance cloned from this + * @param other_agg An arbitrary built-in groupby aggregation + * @return A `column_view` object corresponding to the output result of the given aggregation */ - [[nodiscard]] virtual std::unique_ptr clone() const = 0; + [[nodiscard]] column_view compute_aggregation(std::unique_ptr other_agg) const + { + CUDF_EXPECTS(callback_compute_aggregation, "Uninitialized callback for computing aggregation."); + return callback_compute_aggregation(std::move(other_agg)); + } }; /** @} */ // end of group diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index d873e93bd20..5574ed6ea6e 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -967,7 +967,9 @@ class udf_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying host-based UDF aggregation. */ -class host_udf_aggregation final : public groupby_aggregation { +class host_udf_aggregation final : public groupby_aggregation, + public reduce_aggregation, + public segmented_reduce_aggregation { public: std::unique_ptr udf_ptr; diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 4c90cd0eef5..6234148e9fa 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,8 +145,11 @@ struct empty_column_constructor { } if constexpr (k == aggregation::Kind::HOST_UDF) { - auto const& udf_ptr = dynamic_cast(agg).udf_ptr; - return std::get>(udf_ptr->get_empty_output(std::nullopt, stream, mr)); + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation."); + return udf_ptr->get_empty_output(stream, mr); } return make_empty_column(target_type(values.type(), k)); diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 6480070e85a..fb3f7559d64 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -795,58 +795,41 @@ void aggregate_result_functor::operator()(aggregation con { if (cache.has_result(values, agg)) { return; } - auto const& udf_ptr = dynamic_cast(agg).udf_ptr; - auto const data_attrs = [&]() -> host_udf_base::data_attribute_set_t { - if (auto tmp = udf_ptr->get_required_data(); !tmp.empty()) { return tmp; } - // Empty attribute set means everything. 
- return {host_udf_base::groupby_data_attribute::INPUT_VALUES, - host_udf_base::groupby_data_attribute::GROUPED_VALUES, - host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, - host_udf_base::groupby_data_attribute::NUM_GROUPS, - host_udf_base::groupby_data_attribute::GROUP_OFFSETS, - host_udf_base::groupby_data_attribute::GROUP_LABELS}; - }(); + auto const& udf_base_ptr = dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation."); - // Do not cache udf_input, as the actual input data may change from run to run. - host_udf_base::input_map_t udf_input; - for (auto const& attr : data_attrs) { - CUDF_EXPECTS(std::holds_alternative(attr.value) || - std::holds_alternative>(attr.value), - "Invalid input data attribute for HOST_UDF groupby aggregation."); - if (std::holds_alternative(attr.value)) { - switch (std::get(attr.value)) { - case host_udf_base::groupby_data_attribute::INPUT_VALUES: - udf_input.emplace(attr, values); - break; - case host_udf_base::groupby_data_attribute::GROUPED_VALUES: - udf_input.emplace(attr, get_grouped_values()); - break; - case host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES: - udf_input.emplace(attr, get_sorted_values()); - break; - case host_udf_base::groupby_data_attribute::NUM_GROUPS: - udf_input.emplace(attr, helper.num_groups(stream)); - break; - case host_udf_base::groupby_data_attribute::GROUP_OFFSETS: - udf_input.emplace(attr, helper.group_offsets(stream)); - break; - case host_udf_base::groupby_data_attribute::GROUP_LABELS: - udf_input.emplace(attr, helper.group_labels(stream)); - break; - default: CUDF_UNREACHABLE("Invalid input data attribute for HOST_UDF groupby aggregation."); - } - } else { // data is result from another aggregation - auto other_agg = std::get>(attr.value)->clone(); + if (!udf_ptr->callback_input_values) { + udf_ptr->callback_input_values = [&]() -> column_view { return values; }; + } + if (!udf_ptr->callback_grouped_values) { + udf_ptr->callback_grouped_values = [&]() -> column_view { return get_grouped_values(); }; + } + if (!udf_ptr->callback_sorted_grouped_values) { + udf_ptr->callback_sorted_grouped_values = [&]() -> column_view { return get_sorted_values(); }; + } + if (!udf_ptr->callback_num_groups) { + udf_ptr->callback_num_groups = [&]() -> size_type { return helper.num_groups(stream); }; + } + if (!udf_ptr->callback_group_offsets) { + udf_ptr->callback_group_offsets = [&]() -> device_span { + return helper.group_offsets(stream); + }; + } + if (!udf_ptr->callback_group_labels) { + udf_ptr->callback_group_labels = [&]() -> device_span { + return helper.group_labels(stream); + }; + } + if (!udf_ptr->callback_compute_aggregation) { + udf_ptr->callback_compute_aggregation = + [&](std::unique_ptr other_agg) -> column_view { cudf::detail::aggregation_dispatcher(other_agg->kind, *this, *other_agg); - auto result = cache.get_result(values, *other_agg); - udf_input.emplace(std::move(other_agg), std::move(result)); - } + return cache.get_result(values, *other_agg); + }; } - auto output = (*udf_ptr)(udf_input, stream, mr); - CUDF_EXPECTS(std::holds_alternative>(output), - "Invalid output type from HOST_UDF groupby aggregation."); - cache.add_result(values, agg, std::get>(std::move(output))); + cache.add_result(values, agg, (*udf_ptr)(stream, mr)); } } // namespace detail diff --git a/cpp/src/groupby/sort/host_udf_aggregation.cpp b/cpp/src/groupby/sort/host_udf_aggregation.cpp index 
0da47e17f48..6f1fe80c4bd 100644 --- a/cpp/src/groupby/sort/host_udf_aggregation.cpp +++ b/cpp/src/groupby/sort/host_udf_aggregation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,51 +16,9 @@ #include #include -#include namespace cudf { -host_udf_base::data_attribute::data_attribute(data_attribute const& other) - : value{std::visit(cudf::detail::visitor_overload{[](auto const& val) { return value_type{val}; }, - [](std::unique_ptr const& val) { - return value_type{val->clone()}; - }}, - other.value)} -{ -} - -std::size_t host_udf_base::data_attribute::hash::operator()(data_attribute const& attr) const -{ - auto const hash_value = - std::visit(cudf::detail::visitor_overload{ - [](auto const& val) { return std::hash{}(static_cast(val)); }, - [](std::unique_ptr const& val) { return val->do_hash(); }}, - attr.value); - return std::hash{}(attr.value.index()) ^ hash_value; -} - -bool host_udf_base::data_attribute::equal_to::operator()(data_attribute const& lhs, - data_attribute const& rhs) const -{ - auto const& lhs_val = lhs.value; - auto const& rhs_val = rhs.value; - if (lhs_val.index() != rhs_val.index()) { return false; } - return std::visit( - cudf::detail::visitor_overload{ - [](auto const& lhs_val, auto const& rhs_val) { - if constexpr (std::is_same_v) { - return lhs_val == rhs_val; - } else { - return false; - } - }, - [](std::unique_ptr const& lhs_val, std::unique_ptr const& rhs_val) { - return lhs_val->is_equal(*rhs_val); - }}, - lhs_val, - rhs_val); -} - namespace detail { host_udf_aggregation::host_udf_aggregation(std::unique_ptr udf_ptr_) @@ -99,5 +57,9 @@ template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation); template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation(std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); } // namespace cudf diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 75ebc078930..928625a7e8f 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -144,6 +145,13 @@ struct reduce_dispatch_functor { auto td_agg = static_cast(agg); return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr); } + case aggregation::HOST_UDF: { + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for reduction."); + return (*udf_ptr)(col, output_dtype, init, stream, mr); + } // case aggregation::HOST_UDF default: CUDF_FAIL("Unsupported reduction operator"); } } @@ -161,9 +169,11 @@ std::unique_ptr reduce(column_view const& col, cudf::data_type_error); if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || - agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL || + agg.kind == aggregation::HOST_UDF)) { CUDF_FAIL( - "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); + "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF " + "aggregation types"); } // Returns default scalar if input column is empty or all null diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 1c3a2b0c0f3..5835bfcf0a1 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +#include #include #include #include @@ -98,6 +100,13 @@ struct segmented_reduce_dispatch_functor { } case segmented_reduce_aggregation::NUNIQUE: return segmented_nunique(col, offsets, null_handling, stream, mr); + case aggregation::HOST_UDF: { + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for segmented reduction."); + return (*udf_ptr)(col, offsets, output_dtype, null_handling, init, stream, mr); + } // case aggregation::HOST_UDF default: CUDF_FAIL("Unsupported aggregation type."); } } @@ -117,9 +126,11 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values, cudf::data_type_error); if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || - agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL || + agg.kind == aggregation::HOST_UDF)) { CUDF_FAIL( - "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); + "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF " + "aggregation types"); } if (segmented_values.is_empty() && offsets.empty()) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 344979e1288..35877ac34b9 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -220,11 +220,12 @@ ConfigureTest( REDUCTIONS_TEST reductions/collect_ops_tests.cpp reductions/ewm_tests.cpp + reductions/host_udf_example_tests.cu + reductions/list_rank_test.cpp reductions/rank_tests.cpp reductions/reduction_tests.cpp reductions/scan_tests.cpp reductions/segmented_reduction_tests.cpp - reductions/list_rank_test.cpp reductions/tdigest_tests.cu GPUS 1 PERCENT 70 diff --git a/cpp/tests/groupby/host_udf_example_tests.cu b/cpp/tests/groupby/host_udf_example_tests.cu index a454bd692fc..e1ded37d8a7 100644 --- a/cpp/tests/groupby/host_udf_example_tests.cu +++ b/cpp/tests/groupby/host_udf_example_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,9 +21,7 @@ #include #include #include -#include #include -#include #include #include @@ -34,6 +32,9 @@ #include #include +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; + namespace { /** * @brief A host-based UDF implementation for groupby. @@ -41,42 +42,21 @@ namespace { * For each group of values, the aggregation computes * `(group_idx + 1) * group_sum_of_squares - group_max * group_sum`. */ -struct host_udf_groupby_example : cudf::host_udf_base { +struct host_udf_groupby_example : cudf::groupby_host_udf { host_udf_groupby_example() = default; - [[nodiscard]] data_attribute_set_t get_required_data() const override - { - // We need grouped values, group offsets, group labels, and also results from groups' - // MAX and SUM aggregations. 
- return {groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS, - cudf::make_max_aggregation(), - cudf::make_sum_aggregation()}; - } - - [[nodiscard]] output_t get_empty_output( - [[maybe_unused]] std::optional output_dtype, - [[maybe_unused]] rmm::cuda_stream_view stream, - [[maybe_unused]] rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr get_empty_output( + rmm::cuda_stream_view, rmm::device_async_resource_ref) const override { return cudf::make_empty_column( cudf::data_type{cudf::type_to_id()}); } - [[nodiscard]] output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override { - auto const& values = - std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); - return cudf::type_dispatcher(values.type(), groupby_fn{this}, input, stream, mr); - } - - [[nodiscard]] std::size_t do_hash() const override - { - // Just return the same hash for all instances of this class. - return std::size_t{12345}; + auto const values = get_grouped_values(); + return cudf::type_dispatcher(values.type(), groupby_fn{*this}, stream, mr); } [[nodiscard]] bool is_equal(host_udf_base const& other) const override @@ -92,37 +72,33 @@ struct host_udf_groupby_example : cudf::host_udf_base { struct groupby_fn { // Store pointer to the parent class so we can call its functions. - host_udf_groupby_example const* parent; + host_udf_groupby_example const& parent; - // For simplicity, this example only accepts double input and always produces double output. + // For simplicity, this example only accepts a single type input and output. using InputType = double; using OutputType = double; template )> - output_t operator()(Args...) const + std::unique_ptr operator()(Args...) 
const { CUDF_FAIL("Unsupported input type."); } template )> - output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + std::unique_ptr operator()(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - auto const& values = - std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); - if (values.size() == 0) { return parent->get_empty_output(std::nullopt, stream, mr); } + auto const values = parent.get_grouped_values(); + if (values.size() == 0) { return parent.get_empty_output(stream, mr); } - auto const offsets = std::get>( - input.at(groupby_data_attribute::GROUP_OFFSETS)); + auto const offsets = parent.get_group_offsets(); CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets."); auto const num_groups = static_cast(offsets.size()) - 1; - auto const group_indices = std::get>( - input.at(groupby_data_attribute::GROUP_LABELS)); - auto const group_max = std::get( - input.at(cudf::make_max_aggregation())); - auto const group_sum = std::get( - input.at(cudf::make_sum_aggregation())); + auto const group_indices = parent.get_group_labels(); + auto const group_max = + parent.compute_aggregation(cudf::make_max_aggregation()); + auto const group_sum = + parent.compute_aggregation(cudf::make_sum_aggregation()); auto const values_dv_ptr = cudf::column_device_view::create(values, stream); auto const output = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, @@ -191,9 +167,6 @@ struct host_udf_groupby_example : cudf::host_udf_base { } // namespace -using doubles_col = cudf::test::fixed_width_column_wrapper; -using int32s_col = cudf::test::fixed_width_column_wrapper; - struct HostUDFGroupbyExampleTest : cudf::test::BaseFixture {}; TEST_F(HostUDFGroupbyExampleTest, SimpleInput) diff --git a/cpp/tests/groupby/host_udf_tests.cpp b/cpp/tests/groupby/host_udf_tests.cpp index 1a0f68c0c6c..17da28cdefc 100644 --- a/cpp/tests/groupby/host_udf_tests.cpp +++ b/cpp/tests/groupby/host_udf_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,178 +26,121 @@ #include namespace { + /** - * @brief A host-based UDF implementation used for unit tests. + * @brief Generate a random aggregation object from {min, max, sum, product}. */ -struct host_udf_test_base : cudf::host_udf_base { +std::unique_ptr get_random_agg() +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distr(1, 4); + switch (distr(gen)) { + case 1: return cudf::make_min_aggregation(); + case 2: return cudf::make_max_aggregation(); + case 3: return cudf::make_sum_aggregation(); + case 4: return cudf::make_product_aggregation(); + default: CUDF_UNREACHABLE("This should not be reached."); + } + return nullptr; +} + +/** + * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
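+ * The `operator()` defined below exercises each `get*` accessor and statically checks that it
+ * returns data of the documented type.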
+ */ +struct host_udf_groupby_test : cudf::groupby_host_udf { int test_location_line; // the location where testing is called bool* test_run; // to check if the test is accidentally skipped - data_attribute_set_t input_attrs; + bool test_other_agg; // test calling other aggregation - host_udf_test_base(int test_location_line_, bool* test_run_, data_attribute_set_t input_attrs_) - : test_location_line{test_location_line_}, - test_run{test_run_}, - input_attrs(std::move(input_attrs_)) + host_udf_groupby_test(int test_location_line_, bool* test_run_, bool test_other_agg_) + : test_location_line{test_location_line_}, test_run{test_run_}, test_other_agg{test_other_agg_} { } - [[nodiscard]] data_attribute_set_t get_required_data() const override { return input_attrs; } - - // This is the main testing function, which checks for the correctness of input data. - // The rests are just to satisfy the interface. - [[nodiscard]] output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::size_t do_hash() const override { return 0; } + [[nodiscard]] bool is_equal(host_udf_base const& other) const override { - SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); - - test_data_attributes(input, stream, mr); - - *test_run = true; // test is run successfully - return get_empty_output(std::nullopt, stream, mr); + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(test_location_line, test_run, test_other_agg); } - [[nodiscard]] output_t get_empty_output( - [[maybe_unused]] std::optional output_dtype, + [[nodiscard]] std::unique_ptr get_empty_output( [[maybe_unused]] rmm::cuda_stream_view stream, [[maybe_unused]] rmm::device_async_resource_ref mr) const override { - // Unused function - dummy output. + // Dummy output. return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); } - [[nodiscard]] std::size_t do_hash() const override { return 0; } - [[nodiscard]] bool is_equal(host_udf_base const& other) const override { return true; } + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override + { + SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); - // The main test function, which must be implemented for each kind of aggregations - // (groupby/reduction/segmented_reduction). - virtual void test_data_attributes(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; -}; + // Perform tests on types for the groupby data: we must ensure the data corresponding to each + // `groupby_data` enum having the correct type. -/** - * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
- */ -struct host_udf_groupby_test : host_udf_test_base { - host_udf_groupby_test(int test_location_line_, - bool* test_run_, - data_attribute_set_t input_attrs_ = {}) - : host_udf_test_base(test_location_line_, test_run_, std::move(input_attrs_)) - { - } + { + auto const inp_data = get_input_values(); + EXPECT_TRUE((std::is_same_v>)); + } - [[nodiscard]] std::unique_ptr clone() const override - { - return std::make_unique(test_location_line, test_run, input_attrs); - } + { + auto const inp_data = get_grouped_values(); + EXPECT_TRUE((std::is_same_v>)); + } - void test_data_attributes(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override - { - data_attribute_set_t check_attrs = input_attrs; - if (check_attrs.empty()) { - check_attrs = data_attribute_set_t{groupby_data_attribute::INPUT_VALUES, - groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::SORTED_GROUPED_VALUES, - groupby_data_attribute::NUM_GROUPS, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS}; + { + auto const inp_data = get_sorted_grouped_values(); + EXPECT_TRUE((std::is_same_v>)); } - EXPECT_EQ(input.size(), check_attrs.size()); - for (auto const& attr : check_attrs) { - EXPECT_TRUE(input.count(attr) > 0); - EXPECT_TRUE(std::holds_alternative(attr.value) || - std::holds_alternative>(attr.value)); - if (std::holds_alternative(attr.value)) { - switch (std::get(attr.value)) { - case groupby_data_attribute::INPUT_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::GROUPED_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::SORTED_GROUPED_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::NUM_GROUPS: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::GROUP_OFFSETS: - EXPECT_TRUE( - std::holds_alternative>(input.at(attr))); - break; - case groupby_data_attribute::GROUP_LABELS: - EXPECT_TRUE( - std::holds_alternative>(input.at(attr))); - break; - default:; - } - } else { // std::holds_alternative>(attr.value) - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - } + + { + auto const inp_data = get_num_groups(); + EXPECT_TRUE((std::is_same_v>)); } - } -}; -/** - * @brief Get a random subset of input data attributes. - */ -cudf::host_udf_base::data_attribute_set_t get_subset( - cudf::host_udf_base::data_attribute_set_t const& attrs) -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution size_distr(1, attrs.size() - 1); - auto const subset_size = size_distr(gen); - auto const elements = - std::vector(attrs.begin(), attrs.end()); - std::uniform_int_distribution idx_distr(0, attrs.size() - 1); - cudf::host_udf_base::data_attribute_set_t output; - while (output.size() < subset_size) { - output.insert(elements[idx_distr(gen)]); - } - return output; -} + { + auto const inp_data = get_group_offsets(); + EXPECT_TRUE((std::is_same_v, + std::decay_t>)); + } -/** - * @brief Generate a random aggregation object from {min, max, sum, product}. 
- */ -std::unique_ptr get_random_agg() -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution distr(1, 4); - switch (distr(gen)) { - case 1: return cudf::make_min_aggregation(); - case 2: return cudf::make_max_aggregation(); - case 3: return cudf::make_sum_aggregation(); - case 4: return cudf::make_product_aggregation(); - default: CUDF_UNREACHABLE("This should not be reached."); + { + auto const inp_data = get_group_labels(); + EXPECT_TRUE((std::is_same_v, + std::decay_t>)); + } + + // Perform tests on type of the result from computing other aggregations. + if (test_other_agg) { + auto const inp_data = compute_aggregation(get_random_agg()); + EXPECT_TRUE((std::is_same_v>)); + } + + *test_run = true; // test is run successfully + return get_empty_output(stream, mr); } - return nullptr; -} +}; } // namespace using int32s_col = cudf::test::fixed_width_column_wrapper; -// Number of randomly testing on the input data attributes. -// For each test, a subset of data attributes will be randomly generated from all the possible input -// data attributes. The input data corresponding to that subset passed from libcudf will be tested -// for correctness. -constexpr int NUM_RANDOM_TESTS = 20; - struct HostUDFTest : cudf::test::BaseFixture {}; -TEST_F(HostUDFTest, GroupbyAllInput) +TEST_F(HostUDFTest, GroupbyBuiltinInput) { bool test_run = false; auto const keys = int32s_col{0, 1, 2}; auto const vals = int32s_col{0, 1, 2}; auto agg = cudf::make_host_udf_aggregation( - std::make_unique(__LINE__, &test_run)); + std::make_unique(__LINE__, &test_run, /*test_other_agg*/ false)); std::vector requests; requests.emplace_back(); @@ -205,28 +148,22 @@ TEST_F(HostUDFTest, GroupbyAllInput) requests[0].aggregations.push_back(std::move(agg)); cudf::groupby::groupby gb_obj( cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); - [[maybe_unused]] auto const grp_result = - gb_obj.aggregate(requests, cudf::test::get_default_stream()); + [[maybe_unused]] auto const grp_result = gb_obj.aggregate( + requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); EXPECT_TRUE(test_run); } -TEST_F(HostUDFTest, GroupbySomeInput) +TEST_F(HostUDFTest, GroupbyWithCallingOtherAggregations) { - auto const keys = int32s_col{0, 1, 2}; - auto const vals = int32s_col{0, 1, 2}; - auto const all_attrs = cudf::host_udf_base::data_attribute_set_t{ - cudf::host_udf_base::groupby_data_attribute::INPUT_VALUES, - cudf::host_udf_base::groupby_data_attribute::GROUPED_VALUES, - cudf::host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, - cudf::host_udf_base::groupby_data_attribute::NUM_GROUPS, - cudf::host_udf_base::groupby_data_attribute::GROUP_OFFSETS, - cudf::host_udf_base::groupby_data_attribute::GROUP_LABELS}; + auto const keys = int32s_col{0, 1, 2}; + auto const vals = int32s_col{0, 1, 2}; + + constexpr int NUM_RANDOM_TESTS = 20; + for (int i = 0; i < NUM_RANDOM_TESTS; ++i) { - bool test_run = false; - auto input_attrs = get_subset(all_attrs); - input_attrs.insert(get_random_agg()); - auto agg = cudf::make_host_udf_aggregation( - std::make_unique(__LINE__, &test_run, std::move(input_attrs))); + bool test_run = false; + auto agg = cudf::make_host_udf_aggregation( + std::make_unique(__LINE__, &test_run, /*test_other_agg*/ true)); std::vector requests; requests.emplace_back(); @@ -234,8 +171,8 @@ TEST_F(HostUDFTest, GroupbySomeInput) requests[0].aggregations.push_back(std::move(agg)); cudf::groupby::groupby gb_obj( cudf::table_view({keys}), 
cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); - [[maybe_unused]] auto const grp_result = - gb_obj.aggregate(requests, cudf::test::get_default_stream()); + [[maybe_unused]] auto const grp_result = gb_obj.aggregate( + requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); EXPECT_TRUE(test_run); } } diff --git a/cpp/tests/reductions/host_udf_example_tests.cu b/cpp/tests/reductions/host_udf_example_tests.cu new file mode 100644 index 00000000000..67b88c5306b --- /dev/null +++ b/cpp/tests/reductions/host_udf_example_tests.cu @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; +using int64s_col = cudf::test::fixed_width_column_wrapper; + +namespace { +/** + * @brief A host-based UDF implementation for reduction. + * + * The aggregation computes `sum(value^2, for value in group)` (this is sum of squared). + */ +struct host_udf_reduction_example : cudf::reduce_host_udf { + host_udf_reduction_example() = default; + + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + return cudf::double_type_dispatcher( + input.type(), output_dtype, reduce_fn{}, input, output_dtype, init, stream, mr); + } + + [[nodiscard]] bool is_equal(host_udf_base const& other) const override + { + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(); + } + + struct reduce_fn { + // For simplicity, this example only accepts a single type input and output. + using InputType = double; + using OutputType = int64_t; + + template || !std::is_same_v)> + std::unique_ptr operator()(Args...) 
const + { + CUDF_FAIL("Unsupported input/output type."); + } + + template && std::is_same_v)> + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id()}, + "Invalid output type."); + if (input.size() == 0) { + return cudf::make_default_constructed_scalar(output_dtype, stream, mr); + } + + auto const init_value = [&]() -> InputType { + if (init.has_value() && init.value().get().is_valid(stream)) { + auto const numeric_init_scalar = + dynamic_cast const*>(&init.value().get()); + CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction."); + return numeric_init_scalar->value(stream); + } + return InputType{0}; + }(); + + auto const input_dv_ptr = cudf::column_device_view::create(input, stream); + auto const result = thrust::transform_reduce(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + transform_fn{*input_dv_ptr}, + static_cast(init_value), + thrust::plus<>{}); + + auto output = cudf::make_numeric_scalar(output_dtype, stream, mr); + static_cast*>(output.get())->set_value(result, stream); + return output; + } + + struct transform_fn { + cudf::column_device_view values; + OutputType __device__ operator()(cudf::size_type idx) const + { + if (values.is_null(idx)) { return OutputType{0}; } + auto const val = static_cast(values.element(idx)); + return val * val; + } + }; + }; +}; + +} // namespace + +struct HostUDFReductionExampleTest : cudf::test::BaseFixture {}; + +TEST_F(HostUDFReductionExampleTest, SimpleInput) +{ + auto const vals = doubles_col{0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const reduced = cudf::reduce(vals, + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + EXPECT_TRUE(reduced->is_valid()); + EXPECT_EQ(cudf::type_id::INT64, reduced->type().id()); + auto const result = + static_cast*>(reduced.get())->value(cudf::get_default_stream()); + auto constexpr expected = 55; // 0^2 + 1^2 + 2^2 + 3^2 + 4^2 + 5^2 = 55 + EXPECT_EQ(expected, result); +} + +TEST_F(HostUDFReductionExampleTest, EmptyInput) +{ + auto const vals = doubles_col{}; + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const reduced = cudf::reduce(vals, + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + EXPECT_FALSE(reduced->is_valid()); + EXPECT_EQ(cudf::type_id::INT64, reduced->type().id()); +} + +namespace { + +/** + * @brief A host-based UDF implementation for segmented reduction. + * + * The aggregation computes `sum(value^2, for value in group)` (this is sum of squared). 
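+ * Unlike the plain reduction above, each segment's result is additionally scaled by the
+ * segment size: `segment_size * sum(value^2)` (see `transform_fn` below).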
+ */
+struct host_udf_segmented_reduction_example : cudf::segmented_reduce_host_udf {
+  host_udf_segmented_reduction_example() = default;
+
+  [[nodiscard]] std::unique_ptr<cudf::column> operator()(
+    cudf::column_view const& input,
+    cudf::device_span<cudf::size_type const> offsets,
+    cudf::data_type output_dtype,
+    cudf::null_policy null_handling,
+    std::optional<std::reference_wrapper<cudf::scalar const>> init,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) const override
+  {
+    return cudf::double_type_dispatcher(input.type(),
+                                        output_dtype,
+                                        segmented_reduce_fn{},
+                                        input,
+                                        offsets,
+                                        output_dtype,
+                                        null_handling,
+                                        init,
+                                        stream,
+                                        mr);
+  }
+
+  [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+  {
+    // Just check if the other object is also an instance of this class.
+    return dynamic_cast<host_udf_segmented_reduction_example const*>(&other) != nullptr;
+  }
+
+  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+  {
+    return std::make_unique<host_udf_segmented_reduction_example>();
+  }
+
+  struct segmented_reduce_fn {
+    // For simplicity, this example only accepts a single type input and output.
+    using InputType  = double;
+    using OutputType = int64_t;
+
+    template <typename T,
+              typename U,
+              typename... Args,
+              CUDF_ENABLE_IF(!std::is_same_v<T, InputType> || !std::is_same_v<U, OutputType>)>
+    std::unique_ptr<cudf::column> operator()(Args...) const
+    {
+      CUDF_FAIL("Unsupported input/output type.");
+    }
+
+    template <typename T,
+              typename U,
+              CUDF_ENABLE_IF(std::is_same_v<T, InputType> && std::is_same_v<U, OutputType>)>
+    std::unique_ptr<cudf::column> operator()(
+      cudf::column_view const& input,
+      cudf::device_span<cudf::size_type const> offsets,
+      cudf::data_type output_dtype,
+      cudf::null_policy null_handling,
+      std::optional<std::reference_wrapper<cudf::scalar const>> init,
+      rmm::cuda_stream_view stream,
+      rmm::device_async_resource_ref mr) const
+    {
+      CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id<OutputType>()},
+                   "Invalid output type.");
+      CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets.");
+      auto const num_segments = static_cast<cudf::size_type>(offsets.size()) - 1;
+
+      if (input.size() == 0) {
+        if (num_segments <= 0) { return cudf::make_empty_column(output_dtype); }
+        return cudf::make_numeric_column(
+          output_dtype, num_segments, cudf::mask_state::ALL_NULL, stream, mr);
+      }
+
+      auto const init_value = [&]() -> InputType {
+        if (init.has_value() && init.value().get().is_valid(stream)) {
+          auto const numeric_init_scalar =
+            dynamic_cast<cudf::numeric_scalar<InputType> const*>(&init.value().get());
+          CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction.");
+          return numeric_init_scalar->value(stream);
+        }
+        return InputType{0};
+      }();
+
+      auto const input_dv_ptr = cudf::column_device_view::create(input, stream);
+      auto output             = cudf::make_numeric_column(
+        output_dtype, num_segments, cudf::mask_state::UNALLOCATED, stream);
+
+      // Store row index if it is valid, otherwise store a negative value denoting a null row.
+      rmm::device_uvector<cudf::size_type> valid_idx(num_segments, stream);
+
+      thrust::transform(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_segments),
+        thrust::make_zip_iterator(output->mutable_view().begin<OutputType>(), valid_idx.begin()),
+        transform_fn{*input_dv_ptr, offsets, static_cast<OutputType>(init_value), null_handling});
+
+      auto const valid_idx_cv = cudf::column_view{
+        cudf::data_type{cudf::type_id::INT32}, num_segments, valid_idx.begin(), nullptr, 0};
+      return std::move(cudf::gather(cudf::table_view{{output->view()}},
+                                    valid_idx_cv,
+                                    cudf::out_of_bounds_policy::NULLIFY,
+                                    stream,
+                                    mr)
+                         ->release()
+                         .front());
+    }
+
+    struct transform_fn {
+      cudf::column_device_view values;
+      cudf::device_span<cudf::size_type const> offsets;
+      OutputType init_value;
+      cudf::null_policy null_handling;
+
+      thrust::tuple<OutputType, cudf::size_type> __device__ operator()(cudf::size_type idx) const
+      {
+        auto const start = offsets[idx];
+        auto const end   = offsets[idx + 1];
+
+        auto constexpr invalid_idx = cuda::std::numeric_limits<cudf::size_type>::lowest();
+        if (start == end) { return {OutputType{0}, invalid_idx}; }
+
+        auto sum = init_value;
+        for (auto i = start; i < end; ++i) {
+          if (values.is_null(i)) {
+            if (null_handling == cudf::null_policy::INCLUDE) { sum += init_value * init_value; }
+            continue;
+          }
+          auto const val = static_cast<OutputType>(values.element<InputType>(i));
+          sum += val * val;
+        }
+        auto const segment_size = end - start;
+        return {static_cast<OutputType>(segment_size) * sum, idx};
+      }
+    };
+  };
+};
+
+}  // namespace
+
+struct HostUDFSegmentedReductionExampleTest : cudf::test::BaseFixture {};
+
+TEST_F(HostUDFSegmentedReductionExampleTest, SimpleInput)
+{
+  double constexpr null = 0.0;
+  auto const vals       = doubles_col{{0.0, null, 2.0, 3.0, null, 5.0, null, null, 8.0, 9.0},
+                                      {true, false, true, true, false, true, false, false, true, true}};
+  auto const offsets    = int32s_col{0, 3, 5, 10}.release();
+  auto const agg        = cudf::make_host_udf_aggregation<cudf::segmented_reduce_aggregation>(
+    std::make_unique<host_udf_segmented_reduction_example>());
+
+  // Test without init value.
+  {
+    auto const result = cudf::segmented_reduce(
+      vals,
+      cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+      *agg,
+      cudf::data_type{cudf::type_id::INT64},
+      cudf::null_policy::INCLUDE,
+      std::nullopt,  // init value
+      cudf::get_default_stream(),
+      cudf::get_current_device_resource_ref());
+
+    // When null_policy is set to `INCLUDE`, the null values are replaced with the init value.
+    // Since the init value is not given, it is set to 0.
+    // [ 3 * (0^2 + init^2 + 2^2), 2 * (3^2 + init^2), 5 * (5^2 + init^2 + init^2 + 8^2 + 9^2) ]
+    auto const expected = int64s_col{{12, 18, 850}, {true, true, true}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
+  // Test with init value, and include nulls.
+  {
+    auto const init_scalar = cudf::make_fixed_width_scalar(3.0);
+    auto const result      = cudf::segmented_reduce(
+      vals,
+      cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+      *agg,
+      cudf::data_type{cudf::type_id::INT64},
+      cudf::null_policy::INCLUDE,
+      *init_scalar,
+      cudf::get_default_stream(),
+      cudf::get_current_device_resource_ref());
+
+    // When null_policy is set to `INCLUDE`, the null values are replaced with the init value.
+    // [ 3 * (3 + 0^2 + 3^2 + 2^2), 2 * (3 + 3^2 + 3^2), 5 * (3 + 5^2 + 3^2 + 3^2 + 8^2 + 9^2) ]
+    auto const expected = int64s_col{{48, 42, 955}, {true, true, true}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
+  // Test with init value, and exclude nulls.
+  {
+    auto const init_scalar = cudf::make_fixed_width_scalar(3.0);
+    auto const result      = cudf::segmented_reduce(
+      vals,
+      cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+      *agg,
+      cudf::data_type{cudf::type_id::INT64},
+      cudf::null_policy::EXCLUDE,
+      *init_scalar,
+      cudf::get_default_stream(),
+      cudf::get_current_device_resource_ref());
+
+    // [ 3 * (3 + 0^2 + 2^2), 2 * (3 + 3^2), 5 * (3 + 5^2 + 8^2 + 9^2) ]
+    auto const expected = int64s_col{{21, 24, 865}, {true, true, true}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(HostUDFSegmentedReductionExampleTest, EmptySegments)
+{
+  auto const vals    = doubles_col{};
+  auto const offsets = int32s_col{0, 0, 0, 0}.release();
+  auto const agg     = cudf::make_host_udf_aggregation<cudf::segmented_reduce_aggregation>(
+    std::make_unique<host_udf_segmented_reduction_example>());
+  auto const result = cudf::segmented_reduce(
+    vals,
+    cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+    *agg,
+    cudf::data_type{cudf::type_id::INT64},
+    cudf::null_policy::INCLUDE,
+    std::nullopt,  // init value
+    cudf::get_default_stream(),
+    cudf::get_current_device_resource_ref());
+  auto const expected = int64s_col{{0, 0, 0}, {false, false, false}};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(HostUDFSegmentedReductionExampleTest, EmptyInput)
+{
+  auto const vals    = doubles_col{};
+  auto const offsets = int32s_col{}.release();
+  auto const agg     = cudf::make_host_udf_aggregation<cudf::segmented_reduce_aggregation>(
+    std::make_unique<host_udf_segmented_reduction_example>());
+  auto const result = cudf::segmented_reduce(
+    vals,
+    cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+    *agg,
+    cudf::data_type{cudf::type_id::INT64},
+    cudf::null_policy::INCLUDE,
+    std::nullopt,  // init value
+    cudf::get_default_stream(),
+    cudf::get_current_device_resource_ref());
+  auto const expected = int64s_col{};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java
index 2276b223740..c07a58ed8a5 100644
--- a/java/src/main/java/ai/rapids/cudf/Aggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
index 27966ddfdd4..234a9ec1ced 100644
--- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
index ba8ae379bae..4f047a68f06 100644
--- a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -161,14 +161,14 @@ public static ReductionAggregation median() {
   /**
    * Aggregate to compute the specified quantiles. Uses linear interpolation by default.
    */
-  public static ReductionAggregation quantile(double ... quantiles) {
+  public static ReductionAggregation quantile(double... quantiles) {
     return new ReductionAggregation(Aggregation.quantile(quantiles));
   }
 
   /**
    * Aggregate to compute various quantiles.
    */
-  public static ReductionAggregation quantile(QuantileMethod method, double ... quantiles) {
+  public static ReductionAggregation quantile(QuantileMethod method, double... quantiles) {
     return new ReductionAggregation(Aggregation.quantile(method, quantiles));
   }
 
@@ -256,7 +256,7 @@ public static ReductionAggregation collectSet() {
    * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal.
    */
   public static ReductionAggregation collectSet(NullPolicy nullPolicy,
-      NullEquality nullEquality, NaNEquality nanEquality) {
+                                                NullEquality nullEquality, NaNEquality nanEquality) {
     return new ReductionAggregation(Aggregation.collectSet(nullPolicy, nullEquality, nanEquality));
   }
 
@@ -286,6 +286,15 @@ public static ReductionAggregation mergeSets(NullEquality nullEquality, NaNEqual
     return new ReductionAggregation(Aggregation.mergeSets(nullEquality, nanEquality));
   }
 
+  /**
+   * Execute a reduction using a host-side user-defined function (UDF).
+   * @param wrapper The wrapper for the native host UDF instance.
+   * @return A new ReductionAggregation instance
+   */
+  public static ReductionAggregation hostUDF(HostUDFWrapper wrapper) {
+    return new ReductionAggregation(Aggregation.hostUDF(wrapper));
+  }
+
   /**
    * Create HistogramAggregation, computing the frequencies for each unique row.
    *
diff --git a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java
index 7ed150a2fec..18e7d874886 100644
--- a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -101,4 +101,13 @@ public static SegmentedReductionAggregation any() {
   public static SegmentedReductionAggregation all() {
     return new SegmentedReductionAggregation(Aggregation.all());
   }
+
+  /**
+   * Execute a reduction using a host-side user-defined function (UDF).
+   * @param wrapper The wrapper for the native host UDF instance.
+   * @return A new SegmentedReductionAggregation instance
+   */
+  public static SegmentedReductionAggregation hostUDF(HostUDFWrapper wrapper) {
+    return new SegmentedReductionAggregation(Aggregation.hostUDF(wrapper));
+  }
 }

From bbf4f7824c23c0c482f52bafdf1ece1213da2f65 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic
Date: Mon, 13 Jan 2025 11:44:54 -0800
Subject: [PATCH 2/4] Host compression (#17656)

Add compression APIs that make the use of nvCOMP transparent. Remove the
direct dependency on nvCOMP in the ORC and Parquet writers.

Add multi-threaded host-side compression; it is currently off by default
and can only be enabled via the `LIBCUDF_USE_HOST_COMPRESSION` environment
variable. The host compression path currently adds D2H + H2D transfers;
avoiding the H2D transfer requires large changes to the writers.
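For reviewers, a minimal usage sketch of the opt-in path described above. This is hedged: the
two environment variables are the ones introduced in this PR, but the output path, codec, and
thread count are illustrative placeholders, and whether a given codec actually takes the host
path is decided internally (only codecs with a host implementation are eligible):

```cpp
#include <cstdlib>

#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

void write_with_host_compression(cudf::table_view const& table)
{
  // Opt in to the experimental host compression path (off by default).
  setenv("LIBCUDF_USE_HOST_COMPRESSION", "1", /*overwrite=*/1);
  // Optionally bound the host thread pool; it is sized once, on first use,
  // and defaults to std::thread::hardware_concurrency().
  setenv("LIBCUDF_HOST_COMPRESSION_NUM_THREADS", "4", /*overwrite=*/1);

  // Standard writer API; the compression backend choice is transparent here.
  auto const options =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"out.parquet"}, table)
      .compression(cudf::io::compression_type::SNAPPY)
      .build();
  cudf::io::write_parquet(options);
}
```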
Also moved handling of the AUTO compression type to the options classes, which should own such defaults (translate AUTO to SNAPPY in this case). Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17656 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/io/orc.hpp | 22 ++- cpp/src/io/comp/comp.cpp | 163 ++++++++++++++++++++- cpp/src/io/comp/{statistics.cu => comp.cu} | 4 +- cpp/src/io/comp/comp.hpp | 54 ++++++- cpp/src/io/comp/gpuinflate.hpp | 15 +- cpp/src/io/functions.cpp | 3 +- cpp/src/io/orc/orc_gpu.hpp | 4 +- cpp/src/io/orc/stripe_enc.cu | 49 +------ cpp/src/io/orc/writer_impl.cu | 144 ++++++------------ cpp/src/io/orc/writer_impl.hpp | 4 +- cpp/src/io/parquet/writer_impl.cu | 99 +++++-------- cpp/src/io/parquet/writer_impl.hpp | 4 +- cpp/src/io/parquet/writer_impl_helpers.cpp | 46 +----- cpp/src/io/parquet/writer_impl_helpers.hpp | 38 +---- cpp/tests/io/orc_test.cpp | 3 +- cpp/tests/io/parquet_misc_test.cpp | 3 +- 17 files changed, 338 insertions(+), 319 deletions(-) rename cpp/src/io/comp/{statistics.cu => comp.cu} (96%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9dabe4e8800..252cc7897d8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -477,13 +477,13 @@ add_library( src/io/avro/reader_impl.cu src/io/comp/brotli_dict.cpp src/io/comp/comp.cpp + src/io/comp/comp.cu src/io/comp/cpu_unbz2.cpp src/io/comp/debrotli.cu src/io/comp/gpuinflate.cu src/io/comp/nvcomp_adapter.cpp src/io/comp/nvcomp_adapter.cu src/io/comp/snap.cu - src/io/comp/statistics.cu src/io/comp/uncomp.cpp src/io/comp/unsnap.cu src/io/csv/csv_gpu.cu diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 163fa20806d..82f7761da2e 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -578,7 +578,7 @@ class orc_writer_options { // Specify the sink to use for writer output sink_info _sink; // Specify the compression format to use - compression_type _compression = compression_type::AUTO; + compression_type _compression = compression_type::SNAPPY; // Specify frequency of statistics collection statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) @@ -733,7 +733,11 @@ class orc_writer_options { * * @param comp Compression type */ - void set_compression(compression_type comp) { _compression = comp; } + void set_compression(compression_type comp) + { + _compression = comp; + if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; } + } /** * @brief Choose granularity of statistics collection. 
@@ -865,7 +869,7 @@ class orc_writer_options_builder {
    */
   orc_writer_options_builder& compression(compression_type comp)
   {
-    options._compression = comp;
+    options.set_compression(comp);
     return *this;
   }
 
@@ -1026,7 +1030,7 @@ class chunked_orc_writer_options {
   // Specify the sink to use for writer output
   sink_info _sink;
   // Specify the compression format to use
-  compression_type _compression = compression_type::AUTO;
+  compression_type _compression = compression_type::SNAPPY;
   // Specify granularity of statistics collection
   statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
   // Maximum size of each stripe (unless smaller than a single row group)
@@ -1157,7 +1161,11 @@ class chunked_orc_writer_options {
    *
    * @param comp The compression type to use
    */
-  void set_compression(compression_type comp) { _compression = comp; }
+  void set_compression(compression_type comp)
+  {
+    _compression = comp;
+    if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; }
+  }
 
   /**
    * @brief Choose granularity of statistics collection
@@ -1279,7 +1287,7 @@ class chunked_orc_writer_options_builder {
    */
   chunked_orc_writer_options_builder& compression(compression_type comp)
   {
-    options._compression = comp;
+    options.set_compression(comp);
     return *this;
   }
 
diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp
index 26535bed43b..3800835eaf1 100644
--- a/cpp/src/io/comp/comp.cpp
+++ b/cpp/src/io/comp/comp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,22 +16,45 @@
 
 #include "comp.hpp"
 
+#include "gpuinflate.hpp"
+#include "io/utilities/getenv_or.hpp"
 #include "io/utilities/hostdevice_vector.hpp"
 #include "nvcomp_adapter.hpp"
 
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
+#include <BS_thread_pool.hpp>
 #include <zlib.h>  // GZIP compression
 
 namespace cudf::io::detail {
 
 namespace {
 
+auto& h_comp_pool()
+{
+  static std::size_t pool_size =
+    getenv_or("LIBCUDF_HOST_COMPRESSION_NUM_THREADS", std::thread::hardware_concurrency());
+  static BS::thread_pool pool(pool_size);
+  return pool;
+}
+
+std::optional<nvcomp::compression_type> to_nvcomp_compression(compression_type compression)
+{
+  switch (compression) {
+    case compression_type::SNAPPY: return nvcomp::compression_type::SNAPPY;
+    case compression_type::ZSTD: return nvcomp::compression_type::ZSTD;
+    case compression_type::LZ4: return nvcomp::compression_type::LZ4;
+    case compression_type::ZLIB: return nvcomp::compression_type::DEFLATE;
+    default: return std::nullopt;
+  }
+}
+
 /**
  * @brief GZIP host compressor (includes header)
 */
@@ -98,8 +121,132 @@ std::vector<uint8_t> compress_snappy(host_span<uint8_t const> src,
   return cudf::detail::make_std_vector_sync(d_dst, stream);
 }
 
+void device_compress(compression_type compression,
+                     device_span<device_span<uint8_t const> const> inputs,
+                     device_span<device_span<uint8_t> const> outputs,
+                     device_span<compression_result> results,
+                     rmm::cuda_stream_view stream)
+{
+  if (compression == compression_type::NONE) { return; }
+
+  auto const nvcomp_type = to_nvcomp_compression(compression);
+  auto nvcomp_disabled   = nvcomp_type.has_value() ? nvcomp::is_compression_disabled(*nvcomp_type)
+                                                   : "invalid compression type";
+  if (not nvcomp_disabled) {
+    return nvcomp::batched_compress(*nvcomp_type, inputs, outputs, results, stream);
+  }
+
+  switch (compression) {
+    case compression_type::SNAPPY: return gpu_snap(inputs, outputs, results, stream);
+    default: CUDF_FAIL("Compression error: " + nvcomp_disabled.value());
+  }
+}
+
+void host_compress(compression_type compression,
+                   device_span<device_span<uint8_t const> const> inputs,
+                   device_span<device_span<uint8_t> const> outputs,
+                   device_span<compression_result> results,
+                   rmm::cuda_stream_view stream)
+{
+  if (compression == compression_type::NONE) { return; }
+
+  auto const num_chunks = inputs.size();
+  auto h_results        = cudf::detail::make_host_vector<compression_result>(num_chunks, stream);
+  auto const h_inputs   = cudf::detail::make_host_vector_async(inputs, stream);
+  auto const h_outputs  = cudf::detail::make_host_vector_async(outputs, stream);
+  stream.synchronize();
+
+  std::vector<std::future<size_t>> tasks;
+  auto const num_streams =
+    std::min({num_chunks,
+              cudf::detail::global_cuda_stream_pool().get_stream_pool_size(),
+              h_comp_pool().get_thread_count()});
+  auto const streams = cudf::detail::fork_streams(stream, num_streams);
+  for (size_t i = 0; i < num_chunks; ++i) {
+    auto const cur_stream = streams[i % streams.size()];
+    auto task = [d_in = h_inputs[i], d_out = h_outputs[i], cur_stream, compression]() -> size_t {
+      auto const h_in  = cudf::detail::make_host_vector_sync(d_in, cur_stream);
+      auto const h_out = compress(compression, h_in, cur_stream);
+      cudf::detail::cuda_memcpy(d_out.subspan(0, h_out.size()), h_out, cur_stream);
+      return h_out.size();
+    };
+    tasks.emplace_back(h_comp_pool().submit_task(std::move(task)));
+  }
+
+  for (auto i = 0ul; i < num_chunks; ++i) {
+    h_results[i] = {tasks[i].get(), compression_status::SUCCESS};
+  }
+  cudf::detail::cuda_memcpy_async(results, h_results, stream);
+}
+
+[[nodiscard]] bool host_compression_supported(compression_type compression)
+{
+  switch (compression) {
+    case compression_type::GZIP:
+    case compression_type::NONE: return true;
+    default: return false;
+  }
+}
+
+[[nodiscard]] bool device_compression_supported(compression_type compression)
+{
+  auto const nvcomp_type = to_nvcomp_compression(compression);
+  switch (compression) {
+    case compression_type::LZ4:
+    case compression_type::ZLIB:
+    case compression_type::ZSTD: return not nvcomp::is_compression_disabled(nvcomp_type.value());
+    case compression_type::SNAPPY:
+    case compression_type::NONE: return true;
+    default: return false;
+  }
+}
+
+[[nodiscard]] bool use_host_compression(
+  compression_type compression,
+  [[maybe_unused]] device_span<device_span<uint8_t const> const> inputs,
+  [[maybe_unused]] device_span<device_span<uint8_t> const> outputs)
+{
+  CUDF_EXPECTS(
+    not host_compression_supported(compression) or device_compression_supported(compression),
+    "Unsupported compression type");
+  if (not host_compression_supported(compression)) { return false; }
+  if (not device_compression_supported(compression)) { return true; }
+  // If both host and device compression are supported, use the host if the env var is set
+  return getenv_or("LIBCUDF_USE_HOST_COMPRESSION", 0);
+}
+
 }  // namespace
 
+std::optional<size_t> compress_max_allowed_chunk_size(compression_type compression)
+{
+  if (auto nvcomp_type = to_nvcomp_compression(compression);
+      nvcomp_type.has_value() and not nvcomp::is_compression_disabled(*nvcomp_type)) {
+    return nvcomp::compress_max_allowed_chunk_size(*nvcomp_type);
+  }
+  return std::nullopt;
+}
+
+[[nodiscard]] size_t compress_required_chunk_alignment(compression_type compression)
+{
+  auto nvcomp_type = to_nvcomp_compression(compression);
+  if (compression == compression_type::NONE or not nvcomp_type.has_value() or
+      nvcomp::is_compression_disabled(*nvcomp_type)) {
+    return 1ul;
+  }
+
+  return nvcomp::required_alignment(*nvcomp_type);
+}
+
+[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size)
+{
+  if (compression == compression_type::NONE) { return uncompressed_size; }
+
+  if (auto nvcomp_type = to_nvcomp_compression(compression); nvcomp_type.has_value()) {
+    return nvcomp::compress_max_output_chunk_size(*nvcomp_type, uncompressed_size);
+  }
+  CUDF_FAIL("Unsupported compression type");
+}
+
 std::vector<uint8_t> compress(compression_type compression,
                               host_span<uint8_t const> src,
                               rmm::cuda_stream_view stream)
@@ -112,4 +259,18 @@ std::vector<uint8_t> compress(compression_type compression,
   }
 }
 
+void compress(compression_type compression,
+              device_span<device_span<uint8_t const> const> inputs,
+              device_span<device_span<uint8_t> const> outputs,
+              device_span<compression_result> results,
+              rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+  if (use_host_compression(compression, inputs, outputs)) {
+    return host_compress(compression, inputs, outputs, results, stream);
+  } else {
+    return device_compress(compression, inputs, outputs, results, stream);
+  }
+}
+
 }  // namespace cudf::io::detail
diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/comp.cu
similarity index 96%
rename from cpp/src/io/comp/statistics.cu
rename to cpp/src/io/comp/comp.cu
index caee9145d2c..af0f73869a2 100644
--- a/cpp/src/io/comp/statistics.cu
+++ b/cpp/src/io/comp/comp.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 * limitations under the License.
 */
 
-#include "gpuinflate.hpp"
+#include "comp.hpp"
 
 #include
diff --git a/cpp/src/io/comp/comp.hpp b/cpp/src/io/comp/comp.hpp
index e16f26e1f06..90932a11499 100644
--- a/cpp/src/io/comp/comp.hpp
+++ b/cpp/src/io/comp/comp.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -57,5 +57,57 @@ std::vector<uint8_t> compress(compression_type compression,
                               host_span<uint8_t const> src,
                               rmm::cuda_stream_view stream);
 
+/**
+ * @brief Maximum size of uncompressed chunks that can be compressed.
+ *
+ * @param compression Compression type
+ * @returns maximum chunk size
+ */
+[[nodiscard]] std::optional<size_t> compress_max_allowed_chunk_size(compression_type compression);
+
+/**
+ * @brief Gets input and output alignment requirements for the given compression type.
+ *
+ * @param compression Compression type
+ * @returns required alignment
+ */
+[[nodiscard]] size_t compress_required_chunk_alignment(compression_type compression);
+
+/**
+ * @brief Gets the maximum size any chunk could compress to in the batch.
+ *
+ * @param compression Compression type
+ * @param uncompressed_size Size of the largest uncompressed chunk in the batch
+ */
+[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size);
+
+/**
+ * @brief Compresses device memory buffers.
+ *
+ * @param compression Type of compression of the input data
+ * @param inputs Device memory buffers to compress
+ * @param outputs Device memory buffers to store the compressed output
+ * @param results Compression results
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+void compress(compression_type compression,
+              device_span<device_span<uint8_t const> const> inputs,
+              device_span<device_span<uint8_t> const> outputs,
+              device_span<compression_result> results,
+              rmm::cuda_stream_view stream);
+
+/**
+ * @brief Aggregate results of compression into a single statistics object.
+ *
+ * @param inputs List of uncompressed input buffers
+ * @param results List of compression results
+ * @param stream CUDA stream to use
+ * @return writer_compression_statistics
+ */
+[[nodiscard]] writer_compression_statistics collect_compression_statistics(
+  device_span<device_span<uint8_t const> const> inputs,
+  device_span<compression_result> results,
+  rmm::cuda_stream_view stream);
+
 }  // namespace io::detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp
index 4b09bd5a84c..0a35b230242 100644
--- a/cpp/src/io/comp/gpuinflate.hpp
+++ b/cpp/src/io/comp/gpuinflate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -124,17 +124,4 @@ void gpu_snap(device_span<device_span<uint8_t const> const> inputs,
               device_span<compression_result> results,
               rmm::cuda_stream_view stream);
 
-/**
- * @brief Aggregate results of compression into a single statistics object.
- *
- * @param inputs List of uncompressed input buffers
- * @param results List of compression results
- * @param stream CUDA stream to use
- * @return writer_compression_statistics
- */
-[[nodiscard]] writer_compression_statistics collect_compression_statistics(
-  device_span<device_span<uint8_t const> const> inputs,
-  device_span<compression_result> results,
-  rmm::cuda_stream_view stream);
-
 }  // namespace cudf::io::detail
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 88423122e16..d63fa9f5c35 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -766,6 +766,7 @@ void parquet_writer_options_base::set_stats_level(statistics_freq sf) { _stats_l
 void parquet_writer_options_base::set_compression(compression_type compression)
 {
   _compression = compression;
+  if (compression == compression_type::AUTO) { _compression = compression_type::SNAPPY; }
 }
 
 void parquet_writer_options_base::enable_int96_timestamps(bool req)
diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp
index f4e75f78dec..8b30cee6681 100644
--- a/cpp/src/io/orc/orc_gpu.hpp
+++ b/cpp/src/io/orc/orc_gpu.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -407,7 +407,7 @@ void CompactOrcDataStreams(device_2dspan<StripeStream> strm_desc,
 std::optional<writer_compression_statistics> CompressOrcDataStreams(
   device_span<uint8_t> compressed_data,
   uint32_t num_compressed_blocks,
-  CompressionKind compression,
+  compression_type compression,
   uint32_t comp_blk_size,
   uint32_t max_comp_blk_size,
   uint32_t comp_block_align,
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index 79ecca0ca99..4f296bb5bfc 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
 */
 
 #include "io/comp/gpuinflate.hpp"
-#include "io/comp/nvcomp_adapter.hpp"
 #include "io/utilities/block_utils.cuh"
 #include "io/utilities/time_utils.cuh"
 #include "orc_gpu.hpp"
@@ -45,8 +44,6 @@ namespace io {
 namespace orc {
 namespace gpu {
 
-namespace nvcomp = cudf::io::detail::nvcomp;
-
 using cudf::detail::device_2dspan;
 using cudf::io::detail::compression_result;
 using cudf::io::detail::compression_status;
@@ -1362,7 +1359,7 @@ void CompactOrcDataStreams(device_2dspan<StripeStream> strm_desc,
 std::optional<writer_compression_statistics> CompressOrcDataStreams(
   device_span<uint8_t> compressed_data,
   uint32_t num_compressed_blocks,
-  CompressionKind compression,
+  compression_type compression,
   uint32_t comp_blk_size,
   uint32_t max_comp_blk_size,
   uint32_t comp_block_align,
@@ -1387,47 +1384,7 @@ std::optional<writer_compression_statistics> CompressOrcDataStreams(
     max_comp_blk_size,
     comp_block_align);
 
-  if (compression == SNAPPY) {
-    try {
-      if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) {
-        cudf::io::detail::gpu_snap(comp_in, comp_out, comp_res, stream);
-      } else {
-        nvcomp::batched_compress(
-          nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream);
-      }
-    } catch (...)
{ - // There was an error in compressing so set an error status for each block - thrust::for_each( - rmm::exec_policy(stream), - comp_res.begin(), - comp_res.end(), - [] __device__(compression_result & stat) { stat.status = compression_status::FAILURE; }); - // Since SNAPPY is the default compression (may not be explicitly requested), fall back to - // writing without compression - CUDF_LOG_WARN("ORC writer: compression failed, writing uncompressed data"); - } - } else if (compression == ZLIB) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress( - nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_res, stream); - } else if (compression == ZSTD) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); - } else if (compression == LZ4) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::LZ4); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::LZ4, comp_in, comp_out, comp_res, stream); - } else if (compression != NONE) { - CUDF_FAIL("Unsupported compression type"); - } + cudf::io::detail::compress(compression, comp_in, comp_out, comp_res, stream); dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ce868b83c04..aa0b509981a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ * @brief cuDF-IO ORC writer class implementation */ -#include "io/comp/nvcomp_adapter.hpp" #include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" @@ -71,8 +70,6 @@ namespace cudf::io::orc::detail { -namespace nvcomp = cudf::io::detail::nvcomp; - template [[nodiscard]] constexpr int varint_size(T val) { @@ -92,21 +89,8 @@ struct row_group_index_info { }; namespace { - /** - * @brief Translates ORC compression to nvCOMP compression - */ -auto to_nvcomp_compression_type(CompressionKind compression_kind) -{ - if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY; - if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE; - if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD; - if (compression_kind == LZ4) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); -} - -/** - * @brief Translates cuDF compression to ORC compression + * @brief Translates cuDF compression to ORC compression. */ orc::CompressionKind to_orc_compression(compression_type compression) { @@ -122,19 +106,14 @@ orc::CompressionKind to_orc_compression(compression_type compression) } /** - * @brief Returns the block size for a given compression kind. + * @brief Returns the block size for a given compression format. 
*/ -constexpr size_t compression_block_size(orc::CompressionKind compression) +size_t compression_block_size(compression_type compression) { - if (compression == orc::CompressionKind::NONE) { return 0; } - - auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) - ? std::nullopt - : nvcomp::compress_max_allowed_chunk_size(ncomp_type); + auto const comp_limit = compress_max_allowed_chunk_size(compression); constexpr size_t max_block_size = 256 * 1024; - return std::min(nvcomp_limit.value_or(max_block_size), max_block_size); + return std::min(comp_limit.value_or(max_block_size), max_block_size); } /** @@ -534,26 +513,6 @@ size_t RLE_stream_size(TypeKind kind, size_t count) } } -auto uncomp_block_alignment(CompressionKind compression_kind) -{ - if (compression_kind == NONE or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { - return 1ul; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind)); -} - -auto comp_block_alignment(CompressionKind compression_kind) -{ - if (compression_kind == NONE or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { - return 1ul; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind)); -} - /** * @brief Builds up per-column streams. * @@ -566,7 +525,7 @@ orc_streams create_streams(host_span columns, file_segmentation const& segmentation, std::map const& decimal_column_sizes, bool enable_dictionary, - CompressionKind compression_kind, + compression_type compression, single_write_mode write_mode) { // 'column 0' row index stream @@ -610,7 +569,7 @@ orc_streams create_streams(host_span columns, auto add_stream = [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { - auto const max_alignment_padding = uncomp_block_alignment(compression_kind) - 1; + auto const max_alignment_padding = compress_required_chunk_alignment(compression) - 1; const auto base = column.index() * gpu::CI_NUM_STREAMS; ids[base + index_type] = streams.size(); streams.push_back(orc::Stream{ @@ -1473,7 +1432,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, * @param[in] rg_stats row group level statistics * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams - * @param[in] compression_kind The compression kind + * @param[in] compression The compression format * @param[in] compression_blocksize The block size used for compression * @param[in] out_sink Sink for writing data */ @@ -1487,7 +1446,7 @@ void write_index_stream(int32_t stripe_id, host_span rg_stats, StripeInformation* stripe, orc_streams* streams, - CompressionKind compression_kind, + compression_type compression, size_t compression_blocksize, std::unique_ptr const& out_sink) { @@ -1501,7 +1460,7 @@ void write_index_stream(int32_t stripe_id, row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { auto const& ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; record.blk_pos = ss.first_block; record.comp_pos = 0; @@ -1541,7 +1500,7 @@ void write_index_stream(int32_t stripe_id, } } - ProtobufWriter pbw((compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((compression != compression_type::NONE) ? 
3 : 0);
 
   // Add row index entries
   auto const& rowgroups_range = segmentation.stripes[stripe_id];
@@ -1566,7 +1525,7 @@
     });
 
   (*streams)[stream_id].length = pbw.size();
-  if (compression_kind != NONE) {
+  if (compression != compression_type::NONE) {
     uint32_t uncomp_ix_len = (uint32_t)((*streams)[stream_id].length - 3) * 2 + 1;
     pbw.buffer()[0]        = static_cast<uint8_t>(uncomp_ix_len >> 0);
     pbw.buffer()[1]        = static_cast<uint8_t>(uncomp_ix_len >> 8);
@@ -1585,7 +1544,7 @@
 * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer
 * @param[in,out] stripe Stream's parent stripe
 * @param[in,out] streams List of all streams
- * @param[in] compression_kind The compression kind
+ * @param[in] compression The compression format
 * @param[in] out_sink Sink for writing data
 * @param[in] stream CUDA stream used for device memory operations and kernel launches
 * @return An std::future that should be synchronized to ensure the writing is complete
@@ -1596,7 +1555,7 @@ std::future<void> write_data_stream(gpu::StripeStream const& strm_desc,
                                     host_span<uint8_t> bounce_buffer,
                                     StripeInformation* stripe,
                                     orc_streams* streams,
-                                    CompressionKind compression_kind,
+                                    compression_type compression,
                                     std::unique_ptr<data_sink> const& out_sink,
                                     rmm::cuda_stream_view stream)
 {
@@ -1606,8 +1565,9 @@ std::future<void> write_data_stream(gpu::StripeStream const& strm_desc,
     return std::async(std::launch::deferred, [] {});
   }
 
-  auto const* stream_in = (compression_kind == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type]
-                                                     : (compressed_data + strm_desc.bfr_offset);
+  auto const* stream_in = (compression == compression_type::NONE)
+                            ? enc_stream.data_ptrs[strm_desc.stream_type]
+                            : (compressed_data + strm_desc.bfr_offset);
 
   auto write_task = [&]() {
     if (out_sink->is_device_write_preferred(length)) {
@@ -1627,15 +1587,15 @@ std::future<void> write_data_stream(gpu::StripeStream const& strm_desc,
 /**
 * @brief Insert 3-byte uncompressed block headers in a byte vector
 *
- * @param compression_kind The compression kind
+ * @param compression The compression kind
 * @param compression_blocksize The block size used for compression
 * @param v The destination byte vector to write, which must include initial 3-byte header
 */
-void add_uncompressed_block_headers(CompressionKind compression_kind,
+void add_uncompressed_block_headers(compression_type compression,
                                     size_t compression_blocksize,
                                     std::vector<uint8_t>& v)
 {
-  if (compression_kind != NONE) {
+  if (compression != compression_type::NONE) {
     size_t uncomp_len = v.size() - 3, pos = 0, block_len;
     while (uncomp_len > compression_blocksize) {
       block_len = compression_blocksize * 2 + 1;
@@ -2021,14 +1981,6 @@ std::map<uint32_t, size_t> decimal_column_sizes(
   return column_sizes;
 }
 
-size_t max_compression_output_size(CompressionKind compression_kind, uint32_t compression_blocksize)
-{
-  if (compression_kind == NONE) return 0;
-
-  return nvcomp::compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind),
-                                                compression_blocksize);
-}
-
 std::unique_ptr<table_input_metadata> make_table_meta(table_view const& input)
 {
   auto table_meta = std::make_unique<table_input_metadata>(input);
@@ -2287,7 +2239,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table,
 * @param row_index_stride The row index stride
 * @param enable_dictionary Whether dictionary is enabled
 * @param sort_dictionaries Whether to sort the dictionaries
- * @param compression_kind The compression kind
+ * @param compression The compression format
 * @param compression_blocksize The block size used for compression
 * @param
stats_freq Column statistics granularity type for parquet/orc writers * @param collect_compression_stats Flag to indicate if compression statistics should be collected @@ -2302,7 +2254,7 @@ auto convert_table_to_orc_data(table_view const& input, size_type row_index_stride, bool enable_dictionary, bool sort_dictionaries, - CompressionKind compression_kind, + compression_type compression, size_t compression_blocksize, statistics_freq stats_freq, bool collect_compression_stats, @@ -2329,17 +2281,16 @@ auto convert_table_to_orc_data(table_view const& input, auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); - auto const uncompressed_block_align = uncomp_block_alignment(compression_kind); - auto const compressed_block_align = comp_block_alignment(compression_kind); + auto const block_align = compress_required_chunk_alignment(compression); auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes), enable_dictionary, - compression_kind, + compression, write_mode); auto enc_data = encode_columns( - orc_table, std::move(dec_chunk_sizes), segmentation, streams, uncompressed_block_align, stream); + orc_table, std::move(dec_chunk_sizes), segmentation, streams, block_align, stream); stripe_dicts.on_encode_complete(stream); @@ -2371,16 +2322,15 @@ auto convert_table_to_orc_data(table_view const& input, size_t compressed_bfr_size = 0; size_t num_compressed_blocks = 0; - auto const max_compressed_block_size = - max_compression_output_size(compression_kind, compression_blocksize); + auto const max_compressed_block_size = max_compressed_size(compression, compression_blocksize); auto const padded_max_compressed_block_size = - util::round_up_unsafe(max_compressed_block_size, compressed_block_align); + util::round_up_unsafe(max_compressed_block_size, block_align); auto const padded_block_header_size = - util::round_up_unsafe(block_header_size, compressed_block_align); + util::round_up_unsafe(block_header_size, block_align); for (auto& ss : strm_descs.host_view().flat_view()) { size_t stream_size = ss.stream_size; - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { ss.first_block = num_compressed_blocks; ss.bfr_offset = compressed_bfr_size; @@ -2401,14 +2351,14 @@ auto convert_table_to_orc_data(table_view const& input, comp_results.d_begin(), comp_results.d_end(), compression_result{0, compression_status::FAILURE}); - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { strm_descs.host_to_device_async(stream); compression_stats = gpu::CompressOrcDataStreams(compressed_data, num_compressed_blocks, - compression_kind, + compression, compression_blocksize, max_compressed_block_size, - compressed_block_align, + block_align, collect_compression_stats, strm_descs, enc_data.streams, @@ -2459,8 +2409,8 @@ writer::impl::impl(std::unique_ptr sink, : _stream(stream), _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, _row_index_stride{options.get_row_index_stride()}, - _compression_kind(to_orc_compression(options.get_compression())), - _compression_blocksize(compression_block_size(_compression_kind)), + _compression{options.get_compression()}, + _compression_blocksize(compression_block_size(_compression)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), _sort_dictionaries{options.get_enable_dictionary_sort()}, 
@@ -2480,8 +2430,8 @@ writer::impl::impl(std::unique_ptr sink, : _stream(stream), _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, _row_index_stride{options.get_row_index_stride()}, - _compression_kind(to_orc_compression(options.get_compression())), - _compression_blocksize(compression_block_size(_compression_kind)), + _compression{options.get_compression()}, + _compression_blocksize(compression_block_size(_compression)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), _sort_dictionaries{options.get_enable_dictionary_sort()}, @@ -2526,7 +2476,7 @@ void writer::impl::write(table_view const& input) _row_index_stride, _enable_dictionary, _sort_dictionaries, - _compression_kind, + _compression, _compression_blocksize, _stats_freq, _compression_statistics != nullptr, @@ -2613,7 +2563,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, rg_stats, &stripe, &streams, - _compression_kind, + _compression, _compression_blocksize, _out_sink); } @@ -2627,7 +2577,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, bounce_buffer, &stripe, &streams, - _compression_kind, + _compression, _out_sink, _stream)); } @@ -2645,10 +2595,10 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, : 0; if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } } - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(sf); stripe.footerLength = pbw.size(); - if (_compression_kind != NONE) { + if (_compression != compression_type::NONE) { uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; pbw.buffer()[0] = static_cast(uncomp_sf_len >> 0); pbw.buffer()[1] = static_cast(uncomp_sf_len >> 8); @@ -2780,21 +2730,21 @@ void writer::impl::close() // Write statistics metadata if (not _orc_meta.stripeStats.empty()) { - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(_orc_meta); - add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer()); + add_uncompressed_block_headers(_compression, _compression_blocksize, pbw.buffer()); ps.metadataLength = pbw.size(); _out_sink->host_write(pbw.data(), pbw.size()); } else { ps.metadataLength = 0; } - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(_footer); - add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer()); + add_uncompressed_block_headers(_compression, _compression_blocksize, pbw.buffer()); // Write postscript metadata ps.footerLength = pbw.size(); - ps.compression = _compression_kind; + ps.compression = to_orc_compression(_compression); ps.compressionBlockSize = _compression_blocksize; ps.version = {0, 12}; // Hive 0.12 ps.writerVersion = cudf_writer_version; diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index cae849ee315..7d23482cb17 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -342,7 +342,7 @@ class writer::impl { // Writer options. 
stripe_size_limits const _max_stripe_size; size_type const _row_index_stride; - CompressionKind const _compression_kind; + compression_type const _compression; size_t const _compression_blocksize; std::shared_ptr _compression_statistics; // Optional output statistics_freq const _stats_freq; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 77924ac0f35..1b67b53ae8e 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -23,8 +23,7 @@ #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "interop/decimal_conversion_utilities.cuh" -#include "io/comp/gpuinflate.hpp" -#include "io/comp/nvcomp_adapter.hpp" +#include "io/comp/comp.hpp" #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" @@ -67,6 +66,20 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; +Compression to_parquet_compression(compression_type compression) +{ + switch (compression) { + case compression_type::AUTO: + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::LZ4: + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + return Compression::LZ4_RAW; + case compression_type::NONE: return Compression::UNCOMPRESSED; + default: CUDF_FAIL("Unsupported compression type"); + } +} + struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, host_span const> kv_md, @@ -1172,7 +1185,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, size_t max_page_size_bytes, size_type max_page_size_rows, bool write_v2_headers, - Compression compression_codec, + compression_type compression, rmm::cuda_stream_view stream) { if (chunks.is_empty()) { return cudf::detail::hostdevice_vector{}; } @@ -1187,7 +1200,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1212,7 +1225,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1221,12 +1234,10 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Get per-page max compressed size cudf::detail::hostdevice_vector comp_page_sizes(num_pages, stream); - std::transform(page_sizes.begin(), - page_sizes.end(), - comp_page_sizes.begin(), - [compression_codec](auto page_size) { - return max_compression_output_size(compression_codec, page_size); - }); + std::transform( + page_sizes.begin(), page_sizes.end(), comp_page_sizes.begin(), [compression](auto page_size) { + return max_compressed_size(compression, page_size); + }); comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size @@ -1238,7 +1249,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1247,16 +1258,13 @@ auto init_page_sizes(hostdevice_2dvector& chunks, return comp_page_sizes; } -size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) +size_t max_page_bytes(compression_type compression, size_t 
max_page_size_bytes) { - if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == compression_type::NONE) { return max_page_size_bytes; } - auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) - ? std::nullopt - : nvcomp::compress_max_allowed_chunk_size(ncomp_type); + auto const comp_limit = compress_max_allowed_chunk_size(compression); - auto max_size = std::min(nvcomp_limit.value_or(max_page_size_bytes), max_page_size_bytes); + auto max_size = std::min(comp_limit.value_or(max_page_size_bytes), max_page_size_bytes); // page size must fit in a 32-bit signed integer return std::min(max_size, std::numeric_limits::max()); } @@ -1265,7 +1273,7 @@ std::pair>, std::vector& chunks, host_span col_desc, device_2dspan frags, - Compression compression, + compression_type compression, dictionary_policy dict_policy, size_t max_dict_size, rmm::cuda_stream_view stream) @@ -1404,7 +1412,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param num_columns Total number of columns * @param num_pages Total number of pages * @param num_stats_bfr Number of statistics buffers - * @param compression Compression format + * @param alignment Page alignment * @param max_page_size_bytes Maximum uncompressed page size, in bytes * @param max_page_size_rows Maximum page size, in rows * @param write_v2_headers True if version 2 page headers are to be written @@ -1419,7 +1427,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, uint32_t num_columns, uint32_t num_pages, uint32_t num_stats_bfr, - Compression compression, + size_t alignment, size_t max_page_size_bytes, size_type max_page_size_rows, bool write_v2_headers, @@ -1435,7 +1443,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression), + alignment, write_v2_headers, (num_stats_bfr) ? page_stats_mrg.data() : nullptr, (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, @@ -1478,7 +1486,7 @@ void encode_pages(hostdevice_2dvector& chunks, statistics_chunk const* chunk_stats, statistics_chunk const* column_stats, std::optional& comp_stats, - Compression compression, + compression_type compression, int32_t column_index_truncate_length, bool write_v2_headers, rmm::cuda_stream_view stream) @@ -1488,7 +1496,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats, num_pages) : device_span(); - uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? num_pages : 0; + uint32_t max_comp_pages = (compression != compression_type::NONE) ? 
num_pages : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1499,34 +1507,7 @@ void encode_pages(hostdevice_2dvector& chunks, compression_result{0, compression_status::FAILURE}); EncodePages(pages, write_v2_headers, comp_in, comp_out, comp_res, stream); - switch (compression) { - case Compression::SNAPPY: - if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { - gpu_snap(comp_in, comp_out, comp_res, stream); - } else { - nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } - break; - case Compression::ZSTD: { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); - break; - } - case Compression::LZ4_RAW: { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::LZ4); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::LZ4, comp_in, comp_out, comp_res, stream); - break; - } - case Compression::UNCOMPRESSED: break; - default: CUDF_FAIL("invalid compression type"); - } + compress(compression, comp_in, comp_out, comp_res, stream); // TBD: Not clear if the official spec actually allows dynamically turning off compression at the // chunk-level @@ -1744,7 +1725,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type max_page_size_rows, int32_t column_index_truncate_length, statistics_freq stats_granularity, - Compression compression, + compression_type compression, bool collect_compression_statistics, dictionary_policy dict_policy, size_t max_dictionary_size, @@ -2146,7 +2127,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == compression_type::NONE) { max_comp_bfr_size = 0; } // Initialize data pointers uint32_t const num_stats_bfr = @@ -2214,7 +2195,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, num_columns, num_pages, num_stats_bfr, - compression, + compress_required_chunk_alignment(compression), max_page_size_bytes, max_page_size_rows, write_v2_headers, @@ -2270,7 +2251,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const dev_bfr = ck.is_compressed ? 
ck.compressed_bfr : ck.uncompressed_bfr; auto& column_chunk_meta = row_group.columns[i].meta_data; - if (ck.is_compressed) { column_chunk_meta.codec = compression; } + if (ck.is_compressed) { column_chunk_meta.codec = to_parquet_compression(compression); } if (!out_sink[p]->is_device_write_preferred(ck.compressed_size)) { all_device_write = false; } @@ -2375,7 +2356,7 @@ writer::impl::impl(std::vector> sinks, single_write_mode mode, rmm::cuda_stream_view stream) : _stream(stream), - _compression(to_parquet_compression(options.get_compression())), + _compression(options.get_compression()), _max_row_group_size{options.get_row_group_size_bytes()}, _max_row_group_rows{options.get_row_group_size_rows()}, _max_page_size_bytes(max_page_bytes(_compression, options.get_max_page_size_bytes())), @@ -2406,7 +2387,7 @@ writer::impl::impl(std::vector> sinks, single_write_mode mode, rmm::cuda_stream_view stream) : _stream(stream), - _compression(to_parquet_compression(options.get_compression())), + _compression(options.get_compression()), _max_row_group_size{options.get_row_group_size_bytes()}, _max_row_group_rows{options.get_row_group_size_rows()}, _max_page_size_bytes(max_page_bytes(_compression, options.get_max_page_size_bytes())), diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 63128faf993..d5a5a534b93 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,7 +144,7 @@ class writer::impl { rmm::cuda_stream_view _stream; // Writer options. - Compression const _compression; + compression_type const _compression; size_t const _max_row_group_size; size_type const _max_row_group_rows; size_t const _max_page_size_bytes; diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index f15ea1f3c37..ede788c97c2 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,8 +21,6 @@ #include "writer_impl_helpers.hpp" -#include "io/comp/nvcomp_adapter.hpp" - #include #include #include @@ -32,48 +30,6 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; -Compression to_parquet_compression(compression_type compression) -{ - switch (compression) { - case compression_type::AUTO: - case compression_type::SNAPPY: return Compression::SNAPPY; - case compression_type::ZSTD: return Compression::ZSTD; - case compression_type::LZ4: - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - return Compression::LZ4_RAW; - case compression_type::NONE: return Compression::UNCOMPRESSED; - default: CUDF_FAIL("Unsupported compression type"); - } -} - -nvcomp::compression_type to_nvcomp_compression_type(Compression codec) -{ - switch (codec) { - case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY; - case Compression::ZSTD: return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4; - default: CUDF_FAIL("Unsupported compression type"); - } -} - -uint32_t page_alignment(Compression codec) -{ - if (codec == Compression::UNCOMPRESSED or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { - return 1u; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(codec)); -} - -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) -{ - if (codec == Compression::UNCOMPRESSED) return 0; - - return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); -} - void fill_table_meta(table_input_metadata& table_meta) { // Fill unnamed columns' names in table_meta diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 14a9a0ed5b7..b5c73c348fe 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,48 +20,12 @@ */ #pragma once -#include "parquet_common.hpp" #include #include -#include namespace cudf::io::parquet::detail { -/** - * @brief Function that translates GDF compression to parquet compression. - * - * @param compression The compression type - * @return The supported Parquet compression - */ -Compression to_parquet_compression(compression_type compression); - -/** - * @brief Function that translates the given compression codec to nvcomp compression type. - * - * @param codec Compression codec - * @return Translated nvcomp compression type - */ -cudf::io::detail::nvcomp::compression_type to_nvcomp_compression_type(Compression codec); - -/** - * @brief Function that computes input alignment requirements for the given compression type. - * - * @param codec Compression codec - * @return Required alignment - */ -uint32_t page_alignment(Compression codec); - -/** - * @brief Gets the maximum compressed chunk size for the largest chunk uncompressed chunk in the - * batch. 
- * - * @param codec Compression codec - * @param compression_blocksize Size of the largest uncompressed chunk in the batch - * @return Maximum compressed chunk size - */ -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); - /** * @brief Fill the table metadata with default column names. * diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 2209a30149d..708c2045a74 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -2068,6 +2068,7 @@ TEST_P(OrcCompressionTest, Basic) INSTANTIATE_TEST_CASE_P(OrcCompressionTest, OrcCompressionTest, ::testing::Values(cudf::io::compression_type::NONE, + cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, cudf::io::compression_type::ZSTD)); diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index d66f685cd9c..419ac909ac6 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -268,6 +268,7 @@ TEST_P(ParquetCompressionTest, Basic) INSTANTIATE_TEST_CASE_P(ParquetCompressionTest, ParquetCompressionTest, ::testing::Values(cudf::io::compression_type::NONE, + cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, cudf::io::compression_type::ZSTD)); From 478ec50edf302a338db043039abad6a2560144ea Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 13 Jan 2025 15:19:44 -0600 Subject: [PATCH 3/4] Precompute AST arity (#17234) This PR precomputes AST arity on the host, to reduce the complexity in device-side arity lookup. Authors: - Bradley Dice (https://github.com/bdice) - Basit Ayantunde (https://github.com/lamarrr) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17234 --- cpp/CMakeLists.txt | 1 + .../cudf/ast/detail/expression_evaluator.cuh | 4 +- .../cudf/ast/detail/expression_parser.hpp | 50 ++- cpp/include/cudf/ast/detail/operators.hpp | 418 +++--------------- cpp/src/ast/expression_parser.cpp | 3 +- cpp/src/ast/operators.cpp | 293 ++++++++++++ 6 files changed, 391 insertions(+), 378 deletions(-) create mode 100644 cpp/src/ast/operators.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 252cc7897d8..4d83cbd907c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -336,6 +336,7 @@ add_library( src/aggregation/result_cache.cpp src/ast/expression_parser.cpp src/ast/expressions.cpp + src/ast/operators.cpp src/binaryop/binaryop.cpp src/binaryop/compiled/ATan2.cu src/binaryop/compiled/Add.cu diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index 9d8762555d7..001b604814c 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -452,7 +452,7 @@ struct expression_evaluator { ++operator_index) { // Execute operator auto const op = plan.operators[operator_index]; - auto const arity = ast_operator_arity(op); + auto const arity = plan.operator_arities[operator_index]; if (arity == 1) { // Unary operator auto const& input = diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index b5973d0ace9..d2e8c1cd41f 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -88,6 +89,7 @@ struct expression_device_view { device_span data_references; device_span literals; device_span operators; + device_span operator_arities; device_span operator_source_indices; cudf::size_type num_intermediates; }; @@ -229,39 +231,55 @@ class expression_parser { * @param[in] v The `std::vector` containing components (operators, literals, etc). * @param[in,out] sizes The `std::vector` containing the size of each data buffer. * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. + * @param[in,out] alignment The maximum alignment needed for all the extracted size and pointers */ template void extract_size_and_pointer(std::vector const& v, std::vector& sizes, - std::vector& data_pointers) + std::vector& data_pointers, + cudf::size_type& alignment) { + // sub-type alignment will only work provided the alignment is lesser or equal to + // alignof(max_align_t) which is the maximum alignment provided by rmm's device buffers + static_assert(alignof(T) <= alignof(max_align_t)); auto const data_size = sizeof(T) * v.size(); sizes.push_back(data_size); data_pointers.push_back(v.data()); + alignment = std::max(alignment, static_cast(alignof(T))); } void move_to_device(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { std::vector sizes; std::vector data_pointers; + // use a minimum of 4-byte alignment + cudf::size_type buffer_alignment = 4; - extract_size_and_pointer(_data_references, sizes, data_pointers); - extract_size_and_pointer(_literals, sizes, data_pointers); - extract_size_and_pointer(_operators, sizes, data_pointers); - extract_size_and_pointer(_operator_source_indices, sizes, data_pointers); + extract_size_and_pointer(_data_references, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_literals, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operators, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operator_arities, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operator_source_indices, sizes, data_pointers, buffer_alignment); // Create device buffer - auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); - auto buffer_offsets = std::vector(sizes.size()); - thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); + auto buffer_offsets = std::vector(sizes.size()); + thrust::exclusive_scan(sizes.cbegin(), + sizes.cend(), + buffer_offsets.begin(), + cudf::size_type{0}, + [buffer_alignment](auto a, 
auto b) { + // align each component of the AST program + return cudf::util::round_up_safe(a + b, buffer_alignment); + }); + + auto const buffer_size = buffer_offsets.empty() ? 0 : (buffer_offsets.back() + sizes.back()); + auto host_data_buffer = std::vector(buffer_size); - auto h_data_buffer = std::vector(buffer_size); for (unsigned int i = 0; i < data_pointers.size(); ++i) { - std::memcpy(h_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); + std::memcpy(host_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); } - _device_data_buffer = rmm::device_buffer(h_data_buffer.data(), buffer_size, stream, mr); - + _device_data_buffer = rmm::device_buffer(host_data_buffer.data(), buffer_size, stream, mr); stream.synchronize(); // Create device pointers to components of plan @@ -277,8 +295,11 @@ class expression_parser { device_expression_data.operators = device_span( reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), _operators.size()); - device_expression_data.operator_source_indices = device_span( + device_expression_data.operator_arities = device_span( reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + _operators.size()); + device_expression_data.operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[4]), _operator_source_indices.size()); device_expression_data.num_intermediates = _intermediate_counter.get_max_used(); shmem_per_thread = static_cast( @@ -322,6 +343,7 @@ class expression_parser { bool _has_nulls; std::vector _data_references; std::vector _operators; + std::vector _operator_arities; std::vector _operator_source_indices; std::vector _literals; }; diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 46507700e21..db04e1fe989 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,159 +69,111 @@ constexpr bool is_valid_unary_op = cuda::std::is_invocable_v; * @param args Forwarded arguments to `operator()` of `f`. */ template -CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, F&& f, Ts&&... args) +CUDF_HOST_DEVICE inline constexpr decltype(auto) ast_operator_dispatcher(ast_operator op, + F&& f, + Ts&&... 
args) { switch (op) { case ast_operator::ADD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SUB: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::MUL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TRUE_DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::FLOOR_DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::MOD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::PYMOD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::POW: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NOT_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LESS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::GREATER: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LESS_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::GREATER_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_XOR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOGICAL_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_LOGICAL_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOGICAL_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_LOGICAL_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::IDENTITY: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::IS_NULL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case 
ast_operator::SIN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::COS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TAN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCSIN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCCOS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCTAN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SINH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::COSH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TANH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCSINH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCCOSH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCTANH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::EXP: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOG: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SQRT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CBRT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CEIL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::FLOOR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ABS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::RINT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BIT_INVERT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NOT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_INT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_UINT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_FLOAT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid operator."); @@ -955,231 +907,6 @@ 
struct operator_functor { } }; -/** - * @brief Functor used to single-type-dispatch binary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_binary_op` trait. This function assumes that both inputs are - * the same type, and dispatches based on the type of the left input. - * - * @tparam OperatorFunctor Binary operator functor. - */ -template -struct single_dispatch_binary_operator_types { - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation."); -#else - CUDF_UNREACHABLE("Invalid binary operation."); -#endif - } -}; - -/** - * @brief Functor performing a type dispatch for a binary operator. - * - * This functor performs single dispatch, which assumes lhs_type == rhs_type. This may not be true - * for all binary operators but holds for all currently implemented operators. - */ -struct type_dispatch_binary_op { - /** - * @brief Performs type dispatch for a binary operator. - * - * @tparam op AST operator. - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param lhs_type Type of left input data. - * @param rhs_type Type of right input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ - template - CUDF_HOST_DEVICE inline void operator()(cudf::data_type lhs_type, - cudf::data_type rhs_type, - F&& f, - Ts&&... args) - { - // Single dispatch (assume lhs_type == rhs_type) - type_dispatcher( - lhs_type, - // Always dispatch to the non-null operator for the purpose of type determination. - detail::single_dispatch_binary_operator_types>{}, - std::forward(f), - std::forward(args)...); - } -}; - -/** - * @brief Dispatches a runtime binary operator to a templated type dispatcher. - * - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param lhs_type Type of left input data. - * @param rhs_type Type of right input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ -template -CUDF_HOST_DEVICE inline constexpr void binary_operator_dispatcher( - ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) -{ - ast_operator_dispatcher(op, - detail::type_dispatch_binary_op{}, - lhs_type, - rhs_type, - std::forward(f), - std::forward(args)...); -} - -/** - * @brief Functor used to type-dispatch unary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_unary_op` trait. - * - * @tparam OperatorFunctor Unary operator functor. - */ -template -struct dispatch_unary_operator_types { - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid unary operation."); -#else - CUDF_UNREACHABLE("Invalid unary operation."); -#endif - } -}; - -/** - * @brief Functor performing a type dispatch for a unary operator. 
- */ -struct type_dispatch_unary_op { - template - CUDF_HOST_DEVICE inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args) - { - type_dispatcher( - input_type, - // Always dispatch to the non-null operator for the purpose of type determination. - detail::dispatch_unary_operator_types>{}, - std::forward(f), - std::forward(args)...); - } -}; - -/** - * @brief Dispatches a runtime unary operator to a templated type dispatcher. - * - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param input_type Type of input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ -template -CUDF_HOST_DEVICE inline constexpr void unary_operator_dispatcher(ast_operator op, - cudf::data_type input_type, - F&& f, - Ts&&... args) -{ - ast_operator_dispatcher(op, - detail::type_dispatch_unary_op{}, - input_type, - std::forward(f), - std::forward(args)...); -} - -/** - * @brief Functor to determine the return type of an operator from its input types. - */ -struct return_type_functor { - /** - * @brief Callable for binary operators to determine return type. - * - * @tparam OperatorFunctor Operator functor to perform. - * @tparam LHS Left input type. - * @tparam RHS Right input type. - * @param result Reference whose value is assigned to the result data type. - */ - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { - using Out = cuda::std::invoke_result_t; - result = cudf::data_type(cudf::type_to_id()); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation. Return type cannot be determined."); -#else - CUDF_UNREACHABLE("Invalid binary operation. Return type cannot be determined."); -#endif - } - - /** - * @brief Callable for unary operators to determine return type. - * - * @tparam OperatorFunctor Operator functor to perform. - * @tparam T Input type. - * @param result Pointer whose value is assigned to the result data type. - */ - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { - using Out = cuda::std::invoke_result_t; - result = cudf::data_type(cudf::type_to_id()); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); -#else - CUDF_UNREACHABLE("Invalid unary operation. Return type cannot be determined."); -#endif - } -}; - /** * @brief Gets the return type of an AST operator. * @@ -1187,34 +914,8 @@ struct return_type_functor { * @param operand_types Vector of input types to the operator. * @return cudf::data_type Return type of the operator. */ -inline cudf::data_type ast_operator_return_type(ast_operator op, - std::vector const& operand_types) -{ - auto result = cudf::data_type(cudf::type_id::EMPTY); - switch (operand_types.size()) { - case 1: - unary_operator_dispatcher(op, operand_types[0], detail::return_type_functor{}, result); - break; - case 2: - binary_operator_dispatcher( - op, operand_types[0], operand_types[1], detail::return_type_functor{}, result); - break; - default: CUDF_FAIL("Unsupported operator return type."); break; - } - return result; -} - -/** - * @brief Functor to determine the arity (number of operands) of an operator. 
- */ -struct arity_functor { - template - CUDF_HOST_DEVICE inline void operator()(cudf::size_type& result) - { - // Arity is not dependent on null handling, so just use the false implementation here. - result = operator_functor::arity; - } -}; +cudf::data_type ast_operator_return_type(ast_operator op, + std::vector const& operand_types); /** * @brief Gets the arity (number of operands) of an AST operator. @@ -1222,12 +923,7 @@ struct arity_functor { * @param op Operator used to determine arity. * @return Arity of the operator. */ -CUDF_HOST_DEVICE inline cudf::size_type ast_operator_arity(ast_operator op) -{ - auto result = cudf::size_type(0); - ast_operator_dispatcher(op, detail::arity_functor{}, result); - return result; -} +cudf::size_type ast_operator_arity(ast_operator op); } // namespace detail diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp index d0e4c59ca54..b2cc134d9fa 100644 --- a/cpp/src/ast/expression_parser.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -161,6 +161,7 @@ cudf::size_type expression_parser::visit(operation const& expr) auto const op = expr.get_operator(); auto const data_type = cudf::ast::detail::ast_operator_return_type(op, operand_types); _operators.push_back(op); + _operator_arities.push_back(cudf::ast::detail::ast_operator_arity(op)); // Push data reference auto const output = [&]() { if (expression_index == 0) { diff --git a/cpp/src/ast/operators.cpp b/cpp/src/ast/operators.cpp new file mode 100644 index 00000000000..b60a69a42d9 --- /dev/null +++ b/cpp/src/ast/operators.cpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace ast { +namespace detail { +namespace { + +struct arity_functor { + template + void operator()(cudf::size_type& result) + { + // Arity is not dependent on null handling, so just use the false implementation here. + result = operator_functor::arity; + } +}; + +/** + * @brief Functor to determine the return type of an operator from its input types. + */ +struct return_type_functor { + /** + * @brief Callable for binary operators to determine return type. + * + * @tparam OperatorFunctor Operator functor to perform. + * @tparam LHS Left input type. + * @tparam RHS Right input type. + * @param result Pointer whose value is assigned to the result data type. + */ + template >* = nullptr> + void operator()(cudf::data_type& result) + { + using Out = cuda::std::invoke_result_t; + result = cudf::data_type{cudf::type_to_id()}; + } + + template >* = nullptr> + void operator()(cudf::data_type& result) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid binary operation. 
Return type cannot be determined."); +#else + CUDF_UNREACHABLE("Invalid binary operation. Return type cannot be determined."); +#endif + result = cudf::data_type{cudf::type_id::EMPTY}; + } + + /** + * @brief Callable for unary operators to determine return type. + * + * @tparam OperatorFunctor Operator functor to perform. + * @tparam T Input type. + * @param result Pointer whose value is assigned to the result data type. + */ + template >* = nullptr> + void operator()(cudf::data_type& result) + { + using Out = cuda::std::invoke_result_t; + result = cudf::data_type{cudf::type_to_id()}; + } + + template >* = nullptr> + void operator()(cudf::data_type& result) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); +#else + CUDF_UNREACHABLE("Invalid unary operation. Return type cannot be determined."); +#endif + result = cudf::data_type{cudf::type_id::EMPTY}; + } +}; + +/** + * @brief Functor used to single-type-dispatch binary operators. + * + * This functor's `operator()` is templated to validate calls to its operators based on the input + * type, as determined by the `is_valid_binary_op` trait. This function assumes that both inputs are + * the same type, and dispatches based on the type of the left input. + * + * @tparam OperatorFunctor Binary operator functor. + */ +template +struct single_dispatch_binary_operator_types { + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { + f.template operator()(std::forward(args)...); + } + + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid binary operation."); +#else + CUDF_UNREACHABLE("Invalid binary operation."); +#endif + } +}; + +/** + * @brief Functor performing a type dispatch for a binary operator. + * + * This functor performs single dispatch, which assumes lhs_type == rhs_type. This may not be true + * for all binary operators but holds for all currently implemented operators. + */ +struct type_dispatch_binary_op { + /** + * @brief Performs type dispatch for a binary operator. + * + * @tparam op AST operator. + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param lhs_type Type of left input data. + * @param rhs_type Type of right input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ + template + inline void operator()(cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) + { + // Single dispatch (assume lhs_type == rhs_type) + type_dispatcher( + lhs_type, + // Always dispatch to the non-null operator for the purpose of type determination. + detail::single_dispatch_binary_operator_types>{}, + std::forward(f), + std::forward(args)...); + } +}; + +/** + * @brief Dispatches a runtime binary operator to a templated type dispatcher. + * + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param lhs_type Type of left input data. + * @param rhs_type Type of right input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ +template +inline constexpr void binary_operator_dispatcher( + ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... 
args) +{ + ast_operator_dispatcher(op, + detail::type_dispatch_binary_op{}, + lhs_type, + rhs_type, + std::forward(f), + std::forward(args)...); +} + +/** + * @brief Functor used to type-dispatch unary operators. + * + * This functor's `operator()` is templated to validate calls to its operators based on the input + * type, as determined by the `is_valid_unary_op` trait. + * + * @tparam OperatorFunctor Unary operator functor. + */ +template +struct dispatch_unary_operator_types { + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { + f.template operator()(std::forward(args)...); + } + + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid unary operation."); +#else + CUDF_UNREACHABLE("Invalid unary operation."); +#endif + } +}; + +/** + * @brief Functor performing a type dispatch for a unary operator. + */ +struct type_dispatch_unary_op { + template + inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args) + { + type_dispatcher( + input_type, + // Always dispatch to the non-null operator for the purpose of type determination. + detail::dispatch_unary_operator_types>{}, + std::forward(f), + std::forward(args)...); + } +}; + +/** + * @brief Dispatches a runtime unary operator to a templated type dispatcher. + * + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param input_type Type of input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ +template +inline constexpr void unary_operator_dispatcher(ast_operator op, + cudf::data_type input_type, + F&& f, + Ts&&... args) +{ + ast_operator_dispatcher(op, + detail::type_dispatch_unary_op{}, + input_type, + std::forward(f), + std::forward(args)...); +} + +} // namespace + +cudf::data_type ast_operator_return_type(ast_operator op, + std::vector const& operand_types) +{ + cudf::data_type result{cudf::type_id::EMPTY}; + switch (operand_types.size()) { + case 1: + unary_operator_dispatcher(op, operand_types[0], detail::return_type_functor{}, result); + break; + case 2: + binary_operator_dispatcher( + op, operand_types[0], operand_types[1], detail::return_type_functor{}, result); + break; + default: CUDF_FAIL("Unsupported operator return type."); break; + } + return result; +} + +cudf::size_type ast_operator_arity(ast_operator op) +{ + cudf::size_type result{}; + ast_operator_dispatcher(op, arity_functor{}, result); + return result; +} + +} // namespace detail + +} // namespace ast + +} // namespace cudf From f84cd4316eaa61e231b5fd096608ca09d5e3c08c Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:26:43 -0500 Subject: [PATCH 4/4] [BUG] xfail Polars excel test (#17731) One of the Polars tests fails when `fastexcel>=0.12.1`. I opened https://github.com/pola-rs/polars/issues/20698 to track that failing test. This PR xfails that test for now.
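For illustration, the gist of the change is that an expected-failure entry may now be a `(reason, condition)` pair instead of a bare reason string, with the condition computed once at import time from the installed `fastexcel` version. A minimal sketch of that entry shape, assuming `fastexcel` is installed (names mirror the diff below; this is not additional plugin code):

```python
# Sketch: a conditional expected-failure entry. A plain string means
# "always xfail"; a (reason, condition) tuple gates the xfail on `condition`.
import fastexcel
from packaging import version

conditional_entry = (
    "Fails when fastexcel version >= 0.12.1",
    version.parse(fastexcel.__version__) >= version.parse("0.12.1"),
)
```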
xref #17677 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17731 --- python/cudf_polars/cudf_polars/testing/plugin.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index c16df320ceb..e453a8b89b9 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -8,7 +8,9 @@ from functools import partialmethod from typing import TYPE_CHECKING +import fastexcel import pytest +from packaging import version import polars @@ -44,7 +46,7 @@ def pytest_configure(config: pytest.Config) -> None: ) -EXPECTED_FAILURES: Mapping[str, str] = { +EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = { "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning", @@ -192,6 +194,10 @@ def pytest_configure(config: pytest.Config) -> None: # Maybe flaky, order-dependent? "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", + "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": ( + "Fails when fastexcel version >= 0.12.1. tracking issue: https://github.com/pola-rs/polars/issues/20698", + version.parse(fastexcel.__version__) >= version.parse("0.12.1"), + ), } @@ -219,4 +225,12 @@ def pytest_collection_modifyitems( if item.nodeid in TESTS_TO_SKIP: item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid])) elif item.nodeid in EXPECTED_FAILURES: + if isinstance(EXPECTED_FAILURES[item.nodeid], tuple): + # the second entry in the tuple is the condition to xfail on + item.add_marker( + pytest.mark.xfail( + condition=EXPECTED_FAILURES[item.nodeid][1], + reason=EXPECTED_FAILURES[item.nodeid][0], + ), + ) item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid]))
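As a closing usage note, here is a standalone sketch (a hypothetical test module, not part of cudf-polars) of what the conditional marker added above means at run time: pytest expects a failure only while the condition holds, so environments with an older `fastexcel` still run and enforce the test.

```python
# Hypothetical illustration of conditional-xfail semantics; TOO_NEW stands in
# for the plugin's fastexcel version check.
import pytest

TOO_NEW = False  # e.g. version.parse(fastexcel.__version__) >= version.parse("0.12.1")

@pytest.mark.xfail(condition=TOO_NEW, reason="known fastexcel regression")
def test_write_excel_bytes_sketch():
    assert True  # runs (and must pass) when TOO_NEW is False
```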