diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index a1b7db5e08a..2b2a660bed7 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -601,7 +601,7 @@ std::unique_ptr<Base> make_udf_aggregation(udf_type type,
data_type output_type);
// Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation.
-struct host_udf_base;
+class host_udf_base;
/**
* @brief Factory to create a HOST_UDF aggregation.
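*
* For illustration, a minimal sketch of how user code might attach a UDF through this
* factory; `my_udf` is a hypothetical type deriving from one of the host_udf interfaces
* introduced in host_udf.hpp below:
* @code{.cpp}
* // Hypothetical UDF type; it could equally derive from cudf::reduce_host_udf or
* // cudf::segmented_reduce_host_udf, which this change adds alongside groupby_host_udf.
* struct my_udf : cudf::groupby_host_udf { /* is_equal, clone, operator(), ... */ };
*
* // Wrap an instance in a HOST_UDF aggregation usable by the groupby APIs.
* auto agg = cudf::make_host_udf_aggregation<cudf::groupby_aggregation>(
*   std::make_unique<my_udf>());
* @endcode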
diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp
index bbce76dc5f3..451d75137e4 100644
--- a/cpp/include/cudf/aggregation/host_udf.hpp
+++ b/cpp/include/cudf/aggregation/host_udf.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,18 +17,16 @@
#pragma once
#include <cudf/aggregation.hpp>
+#include <cudf/column/column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>
-#include <cudf/utilities/traits.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/resource_ref.hpp>
+#include <functional>
#include <memory>
-#include <unordered_map>
-#include <unordered_set>
-#include <variant>
/**
* @file host_udf.hpp
@@ -43,49 +41,141 @@ namespace CUDF_EXPORT cudf {
*/
/**
- * @brief The interface for host-based UDF implementation.
+ * @brief The fundamental interface for host-based UDF implementations.
*
- * An implementation of host-based UDF needs to be derived from this base class, defining
- * its own version of the required functions. In particular:
- * - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`,
- * and `clone` functions.
- * - If necessary, the derived class can also override `do_hash` to compute hashing for its
- * instance, and `get_required_data` to selectively access to the input data as well as
- * intermediate data provided by libcudf.
+ * This class declares the functions `do_hash`, `is_equal`, and `clone` that the libcudf
+ * aggregation framework relies on to manage UDF instances. `is_equal` and `clone` must be
+ * defined in the user's UDF implementation, while `do_hash` has a default implementation.
+ */
+class host_udf_base {
+ // Declare the constructor private to prevent users from deriving directly from this class.
+ private:
+ host_udf_base() = default; ///< Default constructor
+
+ // Only the interface structs below are allowed to derive from this class.
+ friend struct reduce_host_udf;
+ friend struct segmented_reduce_host_udf;
+ friend struct groupby_host_udf;
+
+ public:
+ virtual ~host_udf_base() = default; ///< Default destructor
+
+ /**
+ * @brief Computes hash value of the instance.
+ *
+ * Overriding this function is optional; it is needed when the derived class has data members
+ * that distinguish one instance from another.
+ *
+ * @return The hash value of the instance
+ */
+ [[nodiscard]] virtual std::size_t do_hash() const
+ {
+ return std::hash<int>{}(static_cast<int>(aggregation::Kind::HOST_UDF));
+ }
+
+ /**
+ * @brief Compares two instances of the derived class for equality.
+ * @param other The other instance to compare with
+ * @return True if the two instances are equal
+ */
+ [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0;
+
+ /**
+ * @brief Clones the instance.
+ *
+ * Instances of the derived class should be lightweight so that cloning remains efficient.
+ *
+ * @return A new instance cloned from this one
+ */
+ [[nodiscard]] virtual std::unique_ptr<host_udf_base> clone() const = 0;
+};
+
+/**
+ * @brief The interface for host-based UDF implementation for reduction contexts.
+ *
+ * An implementation of a host-based UDF for reduction must derive from this class. In addition
+ * to implementing the virtual functions declared in the base class `host_udf_base`, such a
+ * derived class must also define the `operator()` function to perform the reduction.
*
- * Example of such implementation:
+ * Example:
* @code{.cpp}
- * struct my_udf_aggregation : cudf::host_udf_base {
+ * struct my_udf_aggregation : cudf::reduce_host_udf {
* my_udf_aggregation() = default;
*
- * // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`,
- * // and the result from groupby `MAX` aggregation.
- * [[nodiscard]] data_attribute_set_t get_required_data() const override
+ * [[nodiscard]] std::unique_ptr<scalar> operator()(
+ * column_view const& input,
+ * data_type output_dtype,
+ * std::optional<std::reference_wrapper<scalar const>> init,
+ * rmm::cuda_stream_view stream,
+ * rmm::device_async_resource_ref mr) const override
* {
- * return {groupby_data_attribute::GROUPED_VALUES,
- * groupby_data_attribute::GROUP_OFFSETS,
- * cudf::make_max_aggregation<cudf::groupby_aggregation>()};
+ * // Perform reduction computation using the input data and return the reduction result.
+ * // This is where the actual reduction logic is implemented.
* }
*
- * [[nodiscard]] output_t get_empty_output(
- * [[maybe_unused]] std::optional<data_type> output_dtype,
- * [[maybe_unused]] rmm::cuda_stream_view stream,
- * [[maybe_unused]] rmm::device_async_resource_ref mr) const override
+ * [[nodiscard]] bool is_equal(host_udf_base const& other) const override
* {
- * // This UDF aggregation always returns a column of type INT32.
- * return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
+ * // Check if the other object is also an instance of this class.
+ * // If there are internal state variables, they may need to be checked for equality as well.
+ * return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr;
* }
*
- * [[nodiscard]] output_t operator()(input_map_t const& input,
- * rmm::cuda_stream_view stream,
- * rmm::device_async_resource_ref mr) const override
+ * [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
* {
- * // Perform UDF computation using the input data and return the result.
+ * return std::make_unique<my_udf_aggregation>();
+ * }
+ * };
+ * @endcode
+ */
+struct reduce_host_udf : host_udf_base {
+ /**
+ * @brief Perform reduction operations.
+ *
+ * @param input The input column for reduction
+ * @param output_dtype The data type for the final output scalar
+ * @param init The initial value of the reduction
+ * @param stream The CUDA stream to use for any kernel launches
+ * @param mr Device memory resource to use for any allocations
+ * @return The output result of the aggregation
+ */
+ [[nodiscard]] virtual std::unique_ptr<scalar> operator()(
+ column_view const& input,
+ data_type output_dtype,
+ std::optional<std::reference_wrapper<scalar const>> init,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr) const = 0;
+};
+
+/**
+ * @brief The interface for host-based UDF implementation for segmented reduction contexts.
+ *
+ * An implementation of a host-based UDF for segmented reduction must derive from this class.
+ * In addition to implementing the virtual functions declared in the base class `host_udf_base`,
+ * such a derived class must also define the `operator()` function to perform the segmented
+ * reduction.
+ *
+ * Example:
+ * @code{.cpp}
+ * struct my_udf_aggregation : cudf::segmented_reduce_host_udf {
+ * my_udf_aggregation() = default;
+ *
+ * [[nodiscard]] std::unique_ptr<column> operator()(
+ * column_view const& input,
+ * device_span<size_type const> offsets,
+ * data_type output_dtype,
+ * null_policy null_handling,
+ * std::optional<std::reference_wrapper<scalar const>> init,
+ * rmm::cuda_stream_view stream,
+ * rmm::device_async_resource_ref mr) const override
+ * {
+ * // Perform computation using the input data and return the result.
+ * // This is where the actual segmented reduction logic is implemented.
* }
*
* [[nodiscard]] bool is_equal(host_udf_base const& other) const override
* {
* // Check if the other object is also an instance of this class.
+ * // If there are internal state variables, they may need to be checked for equality as well.
* return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr;
* }
*
@@ -96,198 +186,232 @@ namespace CUDF_EXPORT cudf {
* };
* @endcode
*/
-struct host_udf_base {
- host_udf_base() = default;
- virtual ~host_udf_base() = default;
-
+struct segmented_reduce_host_udf : host_udf_base {
/**
- * @brief Define the possible data needed for groupby aggregations.
+ * @brief Perform segmented reduction operations.
*
- * Note that only sort-based groupby aggregations are supported.
+ * @param input The input column for reduction
+ * @param offsets A list of offsets defining the segments for reduction
+ * @param output_dtype The data type for the final output column
+ * @param null_handling If `INCLUDE` then the reduction result is valid only if all elements in
+ * the segment are valid, and if `EXCLUDE` then the reduction result is valid if any
+ * element in the segment is valid
+ * @param init The initial value of the reduction
+ * @param stream The CUDA stream to use for any kernel launches
+ * @param mr Device memory resource to use for any allocations
+ * @return The output result of the aggregation
*/
- enum class groupby_data_attribute : int32_t {
- INPUT_VALUES, ///< The input values column.
- GROUPED_VALUES, ///< The input values grouped according to the input `keys` for which the
- ///< values within each group maintain their original order.
- SORTED_GROUPED_VALUES, ///< The input values grouped according to the input `keys` and
- ///< sorted within each group.
- NUM_GROUPS, ///< The number of groups (i.e., number of distinct keys).
- GROUP_OFFSETS, ///< The offsets separating groups.
- GROUP_LABELS ///< Group labels (which is also the same as group indices).
- };
+ [[nodiscard]] virtual std::unique_ptr<column> operator()(
+ column_view const& input,
+ device_span<size_type const> offsets,
+ data_type output_dtype,
+ null_policy null_handling,
+ std::optional<std::reference_wrapper<scalar const>> init,
+ rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr) const = 0;
+};
+// Forward declaration.
+namespace groupby::detail {
+struct aggregate_result_functor;
+}
+
+/**
+ * @brief The interface for host-based UDF implementation for groupby aggregation context.
+ *
+ * An implementation of a host-based UDF for groupby must derive from this class. In addition
+ * to implementing the virtual functions declared in the base class `host_udf_base`, such a
+ * derived class must also define `get_empty_output()`, which returns the result when the
+ * input is empty, and `operator()`, which performs its groupby operations.
+ *
+ * During execution, the derived class can access internal data provided by the libcudf groupby
+ * framework through a set of `get*` accessors, as well as call other built-in groupby
+ * aggregations through the `compute_aggregation` function.
+ *
+ * @note The derived class can only perform sort-based groupby aggregations. Hash-based groupby
+ * aggregations require a more complex data structure and are not yet supported.
+ *
+ * Example:
+ * @code{.cpp}
+ * struct my_udf_aggregation : cudf::groupby_host_udf {
+ * my_udf_aggregation() = default;
+ *
+ * [[nodiscard]] std::unique_ptr<column> get_empty_output(
+ * rmm::cuda_stream_view stream,
+ * rmm::device_async_resource_ref mr) const override
+ * {
+ * // Return a column corresponding to the result when the input values column is empty.
+ * }
+ *
+ * [[nodiscard]] std::unique_ptr<column> operator()(
+ * rmm::cuda_stream_view stream,
+ * rmm::device_async_resource_ref mr) const override
+ * {
+ * // Perform UDF computation using the input data and return the result.
+ * }
+ *
+ * [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+ * {
+ * // Check if the other object is also an instance of this class.
+ * // If there are internal state variables, they may need to be checked for equality as well.
+ * return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr;
+ * }
+ *
+ * [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+ * {
+ * return std::make_unique<my_udf_aggregation>();
+ * }
+ * };
+ * @endcode
+ */
+struct groupby_host_udf : host_udf_base {
/**
- * @brief Describe possible data that may be needed in the derived class for its operations.
+ * @brief Get the output when the input values column is empty.
*
- * Such data can be either intermediate data such as sorted values or group labels etc, or the
- * results of other aggregations.
+ * This is called in libcudf when the input values column is empty. In such situations libcudf
+ * tries to generate the output directly without unnecessarily evaluating the intermediate data.
*
- * Each derived host-based UDF class may need a different set of data. It is inefficient to
- * evaluate and pass down all these possible data at once from libcudf. A solution for that is,
- * the derived class can define a subset of data that it needs and libcudf will evaluate
- * and pass down only data requested from that set.
+ * @param stream The CUDA stream to use for any kernel launches
+ * @param mr Device memory resource to use for any allocations
+ * @return The output result of the aggregation when the input values column is empty
*/
- struct data_attribute {
- /**
- * @brief Hold all possible data types for the input of the aggregation in the derived class.
- */
- using value_type = std::variant<groupby_data_attribute, std::unique_ptr<aggregation>>;
- value_type value; ///< The actual data attribute, wrapped by this struct
- ///< as a wrapper is needed to define `hash` and `equal_to` functors.
-
- data_attribute() = default; ///< Default constructor
- data_attribute(data_attribute&&) = default; ///< Move constructor
-
- /**
- * @brief Construct a new data attribute from an aggregation attribute.
- * @param value_ An aggregation attribute
- */
- template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, groupby_data_attribute>)>
- data_attribute(T value_) : value{value_}
- {
- }
-
- /**
- * @brief Construct a new data attribute from another aggregation request.
- * @param value_ An aggregation request
- */
- template <typename T,
- CUDF_ENABLE_IF(std::is_same_v<T, aggregation> ||
- std::is_same_v<T, groupby_aggregation>)>
- data_attribute(std::unique_ptr<T> value_) : value{std::move(value_)}
- {
- CUDF_EXPECTS(std::get<std::unique_ptr<aggregation>>(value) != nullptr,
- "Invalid aggregation request.");
- if constexpr (std::is_same_v<T, aggregation>) {
- CUDF_EXPECTS(
- dynamic_cast<groupby_aggregation*>(std::get<std::unique_ptr<aggregation>>(value).get()) != nullptr,
- "Requesting results from other aggregations is only supported in groupby "
- "aggregations.");
- }
- }
-
- /**
- * @brief Copy constructor.
- * @param other The other data attribute to copy from
- */
- data_attribute(data_attribute const& other);
-
- /**
- * @brief Hash functor for `data_attribute`.
- */
- struct hash {
- /**
- * @brief Compute the hash value of a data attribute.
- * @param attr The data attribute to hash
- * @return The hash value of the data attribute
- */
- std::size_t operator()(data_attribute const& attr) const;
- }; // struct hash
-
- /**
- * @brief Equality comparison functor for `data_attribute`.
- */
- struct equal_to {
- /**
- * @brief Check if two data attributes are equal.
- * @param lhs The left-hand side data attribute
- * @param rhs The right-hand side data attribute
- * @return True if the two data attributes are equal
- */
- bool operator()(data_attribute const& lhs, data_attribute const& rhs) const;
- }; // struct equal_to
- }; // struct data_attribute
+ [[nodiscard]] virtual std::unique_ptr<column> get_empty_output(
+ rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0;
/**
- * @brief Set of attributes for the input data that is needed for computing the aggregation.
+ * @brief Perform the main groupby computation for the host-based UDF.
+ *
+ * @param stream The CUDA stream to use for any kernel launches
+ * @param mr Device memory resource to use for any allocations
+ * @return The output result of the aggregation
*/
- using data_attribute_set_t =
- std::unordered_set<data_attribute, data_attribute::hash, data_attribute::equal_to>;
+ [[nodiscard]] virtual std::unique_ptr<column> operator()(
+ rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0;
+
+ private:
+ // Allow the struct `aggregate_result_functor` to set its private callback variables.
+ friend struct groupby::detail::aggregate_result_functor;
/**
- * @brief Return a set of attributes for the data that is needed for computing the aggregation.
- *
- * The derived class should return the attributes corresponding to only the data that it needs to
- * avoid unnecessary computation performed in libcudf. If this function is not overridden, an
- * empty set is returned. That means all the data attributes (except results from other
- * aggregations in groupby) will be needed.
- *
- * @return A set of `data_attribute`
+ * @brief Callback to access the input values column.
+ */
+ std::function<column_view(void)> callback_input_values;
+
+ /**
+ * @brief Callback to access the input values grouped according to the input keys for which the
+ * values within each group maintain their original order.
+ */
+ std::function<column_view(void)> callback_grouped_values;
+
+ /**
+ * @brief Callback to access the input values grouped according to the input keys and sorted
+ * within each group.
+ */
+ std::function<column_view(void)> callback_sorted_grouped_values;
+
+ /**
+ * @brief Callback to access the number of groups (i.e., number of distinct keys).
*/
- [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; }
+ std::function<size_type(void)> callback_num_groups;
/**
- * @brief Hold all possible types of the data that is passed to the derived class for executing
- * the aggregation.
+ * @brief Callback to access the offsets separating groups.
*/
- using input_data_t = std::variant<column_view, size_type, device_span<size_type const>>;
+ std::function<device_span<size_type const>(void)> callback_group_offsets;
/**
- * @brief Input to the aggregation, mapping from each data attribute to its actual data.
+ * @brief Callback to access the group labels (which is also the same as group indices).
*/
- using input_map_t = std::
- unordered_map<data_attribute, input_data_t, data_attribute::hash, data_attribute::equal_to>;
+ std::function<device_span<size_type const>(void)> callback_group_labels;
/**
- * @brief Output type of the aggregation.
+ * @brief Callback to access the result from other groupby aggregations.
+ */
+ std::function<column_view(std::unique_ptr<aggregation>)> callback_compute_aggregation;
+
+ protected:
+ /**
+ * @brief Access the input values column.
*
- * Currently only a single type is supported as the output of the aggregation, but it will hold
- * more types in the future when reduction is supported.
+ * @return The input values column.
*/
- using output_t = std::variant<std::unique_ptr<column>>;
+ [[nodiscard]] column_view get_input_values() const
+ {
+ CUDF_EXPECTS(callback_input_values, "Uninitialized callback_input_values.");
+ return callback_input_values();
+ }
/**
- * @brief Get the output when the input values column is empty.
+ * @brief Access the input values grouped according to the input keys for which the values
+ * within each group maintain their original order.
*
- * This is called in libcudf when the input values column is empty. In such situations libcudf
- * tries to generate the output directly without unnecessarily evaluating the intermediate data.
+ * @return The grouped values column.
+ */
+ [[nodiscard]] column_view get_grouped_values() const
+ {
+ CUDF_EXPECTS(callback_grouped_values, "Uninitialized callback_grouped_values.");
+ return callback_grouped_values();
+ }
+
+ /**
+ * @brief Access the input values grouped according to the input keys and sorted within each
+ * group.
*
- * @param output_dtype The expected output data type
- * @param stream The CUDA stream to use for any kernel launches
- * @param mr Device memory resource to use for any allocations
- * @return The output result of the aggregation when input values is empty
+ * @return The sorted grouped values column.
*/
- [[nodiscard]] virtual output_t get_empty_output(std::optional<data_type> output_dtype,
- rmm::cuda_stream_view stream,
- rmm::device_async_resource_ref mr) const = 0;
+ [[nodiscard]] column_view get_sorted_grouped_values() const
+ {
+ CUDF_EXPECTS(callback_sorted_grouped_values, "Uninitialized callback_sorted_grouped_values.");
+ return callback_sorted_grouped_values();
+ }
/**
- * @brief Perform the main computation for the host-based UDF.
+ * @brief Access the number of groups (i.e., number of distinct keys).
*
- * @param input The input data needed for performing all computation
- * @param stream The CUDA stream to use for any kernel launches
- * @param mr Device memory resource to use for any allocations
- * @return The output result of the aggregation
+ * @return The number of groups.
*/
- [[nodiscard]] virtual output_t operator()(input_map_t const& input,
- rmm::cuda_stream_view stream,
- rmm::device_async_resource_ref mr) const = 0;
+ [[nodiscard]] size_type get_num_groups() const
+ {
+ CUDF_EXPECTS(callback_num_groups, "Uninitialized callback_num_groups.");
+ return callback_num_groups();
+ }
/**
- * @brief Computes hash value of the class's instance.
- * @return The hash value of the instance
+ * @brief Access the offsets separating groups.
+ *
+ * @return The array of group offsets.
*/
- [[nodiscard]] virtual std::size_t do_hash() const
+ [[nodiscard]] device_span<size_type const> get_group_offsets() const
{
- return std::hash<int>{}(static_cast<int>(aggregation::Kind::HOST_UDF));
+ CUDF_EXPECTS(callback_group_offsets, "Uninitialized callback_group_offsets.");
+ return callback_group_offsets();
}
/**
- * @brief Compares two instances of the derived class for equality.
- * @param other The other derived class's instance to compare with
- * @return True if the two instances are equal
+ * @brief Access the group labels (which is also the same as group indices).
+ *
+ * @return The array of group labels.
*/
- [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0;
+ [[nodiscard]] device_span<size_type const> get_group_labels() const
+ {
+ CUDF_EXPECTS(callback_group_labels, "Uninitialized callback_group_labels.");
+ return callback_group_labels();
+ }
/**
- * @brief Clones the instance.
+ * @brief Compute a built-in groupby aggregation and access its result.
*
- * A class derived from `host_udf_base` should not store too much data such that its instances
- * remain lightweight for efficient cloning.
+ * This allows the derived class to run any other built-in groupby aggregation on the same input
+ * values column and access its output from within `operator()`.
*
- * @return A new instance cloned from this
+ * @param other_agg An arbitrary built-in groupby aggregation
+ * @return A `column_view` object corresponding to the output result of the given aggregation
*/
- [[nodiscard]] virtual std::unique_ptr clone() const = 0;
+ [[nodiscard]] column_view compute_aggregation(std::unique_ptr<aggregation> other_agg) const
+ {
+ CUDF_EXPECTS(callback_compute_aggregation, "Uninitialized callback for computing aggregation.");
+ return callback_compute_aggregation(std::move(other_agg));
+ }
};
/** @} */ // end of group
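
To make the callback-based accessors above concrete, here is a hedged sketch of an
`operator()` body in a `groupby_host_udf` subclass; `my_transform` is a hypothetical
device-side helper standing in for real per-group logic:

@code{.cpp}
std::unique_ptr<cudf::column> operator()(
  rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override
{
  // Each accessor invokes its callback lazily, so data the UDF never asks for
  // (e.g. sorted grouped values here) is never computed by libcudf.
  auto const values  = get_grouped_values();
  auto const offsets = get_group_offsets();
  // Reuse a built-in aggregation on the same input values; its result is cached
  // by the groupby framework.
  auto const sums =
    compute_aggregation(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
  return my_transform(values, offsets, sums, stream, mr);  // hypothetical helper
}
@endcode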
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index d873e93bd20..5574ed6ea6e 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -967,7 +967,9 @@ class udf_aggregation final : public rolling_aggregation {
/**
* @brief Derived class for specifying host-based UDF aggregation.
*/
-class host_udf_aggregation final : public groupby_aggregation {
+class host_udf_aggregation final : public groupby_aggregation,
+ public reduce_aggregation,
+ public segmented_reduce_aggregation {
public:
std::unique_ptr udf_ptr;
diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp
index 2e3d71815c0..44a86f1c84f 100644
--- a/cpp/include/cudf/detail/utilities/integer_utils.hpp
+++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp
@@ -1,7 +1,7 @@
/*
* Copyright 2019 BlazingDB, Inc.
* Copyright 2019 Eyal Rozenberg
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@
*/
#include
+#include
+#include
#include
#include
@@ -44,13 +46,17 @@ namespace util {
* `modulus` is positive. The safety is in regard to rollover.
*/
template <typename S>
-constexpr S round_up_safe(S number_to_round, S modulus)
+CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus)
{
auto remainder = number_to_round % modulus;
if (remainder == 0) { return number_to_round; }
auto rounded_up = number_to_round - remainder + modulus;
if (rounded_up < number_to_round) {
- throw std::invalid_argument("Attempt to round up beyond the type's maximum value");
+#ifndef __CUDA_ARCH__
+ CUDF_FAIL("Attempt to round up beyond the type's maximum value", cudf::data_type_error);
+#else
+ CUDF_UNREACHABLE("Attempt to round up beyond the type's maximum value");
+#endif
}
return rounded_up;
}
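
A short usage sketch (assuming only the definitions above): the function now behaves
identically in host and device code except for the failure path.

@code{.cpp}
// Round 1000 up to the next multiple of 256.
auto const padded = cudf::util::round_up_safe(1000, 256);  // 1024
// On rollover, host code throws cudf::data_type_error via CUDF_FAIL, while device
// code calls CUDF_UNREACHABLE, since exceptions cannot be thrown on the device.
@endcode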
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index e7b76946248..b5044a58934 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -197,11 +197,16 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent>>
 return host_span{this->data() + offset, count, _is_device_accessible};
}
@@ -434,8 +439,8 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Extent>>
 return device_span{this->data() + offset, count};
}
@@ -475,28 +480,28 @@ class base_2dspan {
*
* @return A pointer to the first element of the span
*/
- [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); }
+ [[nodiscard]] CUDF_HOST_DEVICE constexpr auto data() const noexcept { return _flat.data(); }
/**
* @brief Returns the size in the span as pair.
*
* @return pair representing rows and columns size of the span
*/
- [[nodiscard]] constexpr auto size() const noexcept { return _size; }
+ [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return _size; }
/**
* @brief Returns the number of elements in the span.
*
* @return Number of elements in the span
*/
- [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); }
+ [[nodiscard]] CUDF_HOST_DEVICE constexpr auto count() const noexcept { return _flat.size(); }
/**
* @brief Checks if the span is empty.
*
* @return True if the span is empty, false otherwise
*/
- [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; }
+ [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_empty() const noexcept { return count() == 0; }
/**
* @brief Returns a reference to the row-th element of the sequence.
@@ -507,7 +512,7 @@ class base_2dspan {
* @param row the index of the element to access
* @return A reference to the row-th element of the sequence, i.e., `data()[row]`
*/
- constexpr RowType operator[](size_t row) const
+ CUDF_HOST_DEVICE constexpr RowType operator[](size_t row) const
{
return _flat.subspan(row * _size.second, _size.second);
}
@@ -517,7 +522,10 @@ class base_2dspan {
*
* @return A flattened span of the 2D span
*/
- [[nodiscard]] constexpr RowType flat_view() const { return _flat; }
+ [[nodiscard]] CUDF_HOST_DEVICE constexpr RowType flat_view() const
+ {
+ return _flat;
+ }
/**
* @brief Construct a 2D span from another 2D span of convertible type
diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp
deleted file mode 100644
index ae48fed4e79..00000000000
--- a/cpp/include/nvtext/detail/generate_ngrams.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include
-
-#include
-
-#include
-
-namespace CUDF_EXPORT nvtext {
-namespace detail {
-
-/**
- * @copydoc hash_character_ngrams(cudf::strings_column_view const&,
- * cudf::size_type, rmm::device_async_resource_ref)
- *
- * @param stream CUDA stream used for allocating/copying device memory and launching kernels
- */
-std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
- cudf::size_type ngrams,
- rmm::cuda_stream_view stream,
- rmm::device_async_resource_ref mr);
-
-} // namespace detail
-} // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp
index 54282b8ef3c..b2ba1798a8f 100644
--- a/cpp/include/nvtext/generate_ngrams.hpp
+++ b/cpp/include/nvtext/generate_ngrams.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -117,6 +117,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
*
* @param input Strings column to produce ngrams from
* @param ngrams The ngram number to generate. Default is 5.
+ * @param seed The seed value to use with the hash algorithm. Default is 0.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return A lists column of hash values
@@ -124,6 +125,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
std::unique_ptr<cudf::column> hash_character_ngrams(
cudf::strings_column_view const& input,
cudf::size_type ngrams = 5,
+ uint32_t seed = 0,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
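
A brief usage sketch of the extended API (the input strings are illustrative):

@code{.cpp}
auto const input = cudf::test::strings_column_wrapper({"abcdefg", "stuvwxyz"});
auto const view  = cudf::strings_column_view(input);
// Same ngram width, different seeds: the resulting hash lists differ.
auto const h0 = nvtext::hash_character_ngrams(view, 5, 0);
auto const h1 = nvtext::hash_character_ngrams(view, 5, 987654);
@endcode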
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 4c90cd0eef5..6234148e9fa 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -145,8 +145,11 @@ struct empty_column_constructor {
}
if constexpr (k == aggregation::Kind::HOST_UDF) {
- auto const& udf_ptr = dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
- return std::get<std::unique_ptr<column>>(udf_ptr->get_empty_output(std::nullopt, stream, mr));
+ auto const& udf_base_ptr =
+ dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
+ auto const udf_ptr = dynamic_cast<groupby_host_udf*>(udf_base_ptr.get());
+ CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation.");
+ return udf_ptr->get_empty_output(stream, mr);
}
return make_empty_column(target_type(values.type(), k));
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 6480070e85a..fb3f7559d64 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -795,58 +795,41 @@ void aggregate_result_functor::operator()<aggregation::HOST_UDF>(aggregation con
{
if (cache.has_result(values, agg)) { return; }
- auto const& udf_ptr = dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
- auto const data_attrs = [&]() -> host_udf_base::data_attribute_set_t {
- if (auto tmp = udf_ptr->get_required_data(); !tmp.empty()) { return tmp; }
- // Empty attribute set means everything.
- return {host_udf_base::groupby_data_attribute::INPUT_VALUES,
- host_udf_base::groupby_data_attribute::GROUPED_VALUES,
- host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES,
- host_udf_base::groupby_data_attribute::NUM_GROUPS,
- host_udf_base::groupby_data_attribute::GROUP_OFFSETS,
- host_udf_base::groupby_data_attribute::GROUP_LABELS};
- }();
+ auto const& udf_base_ptr = dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
+ auto const udf_ptr = dynamic_cast<groupby_host_udf*>(udf_base_ptr.get());
+ CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation.");
- // Do not cache udf_input, as the actual input data may change from run to run.
- host_udf_base::input_map_t udf_input;
- for (auto const& attr : data_attrs) {
- CUDF_EXPECTS(std::holds_alternative<host_udf_base::groupby_data_attribute>(attr.value) ||
- std::holds_alternative<std::unique_ptr<aggregation>>(attr.value),
- "Invalid input data attribute for HOST_UDF groupby aggregation.");
- if (std::holds_alternative<host_udf_base::groupby_data_attribute>(attr.value)) {
- switch (std::get<host_udf_base::groupby_data_attribute>(attr.value)) {
- case host_udf_base::groupby_data_attribute::INPUT_VALUES:
- udf_input.emplace(attr, values);
- break;
- case host_udf_base::groupby_data_attribute::GROUPED_VALUES:
- udf_input.emplace(attr, get_grouped_values());
- break;
- case host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES:
- udf_input.emplace(attr, get_sorted_values());
- break;
- case host_udf_base::groupby_data_attribute::NUM_GROUPS:
- udf_input.emplace(attr, helper.num_groups(stream));
- break;
- case host_udf_base::groupby_data_attribute::GROUP_OFFSETS:
- udf_input.emplace(attr, helper.group_offsets(stream));
- break;
- case host_udf_base::groupby_data_attribute::GROUP_LABELS:
- udf_input.emplace(attr, helper.group_labels(stream));
- break;
- default: CUDF_UNREACHABLE("Invalid input data attribute for HOST_UDF groupby aggregation.");
- }
- } else { // data is result from another aggregation
- auto other_agg = std::get<std::unique_ptr<aggregation>>(attr.value)->clone();
+ if (!udf_ptr->callback_input_values) {
+ udf_ptr->callback_input_values = [&]() -> column_view { return values; };
+ }
+ if (!udf_ptr->callback_grouped_values) {
+ udf_ptr->callback_grouped_values = [&]() -> column_view { return get_grouped_values(); };
+ }
+ if (!udf_ptr->callback_sorted_grouped_values) {
+ udf_ptr->callback_sorted_grouped_values = [&]() -> column_view { return get_sorted_values(); };
+ }
+ if (!udf_ptr->callback_num_groups) {
+ udf_ptr->callback_num_groups = [&]() -> size_type { return helper.num_groups(stream); };
+ }
+ if (!udf_ptr->callback_group_offsets) {
+ udf_ptr->callback_group_offsets = [&]() -> device_span<size_type const> {
+ return helper.group_offsets(stream);
+ };
+ }
+ if (!udf_ptr->callback_group_labels) {
+ udf_ptr->callback_group_labels = [&]() -> device_span<size_type const> {
+ return helper.group_labels(stream);
+ };
+ }
+ if (!udf_ptr->callback_compute_aggregation) {
+ udf_ptr->callback_compute_aggregation =
+ [&](std::unique_ptr<aggregation> other_agg) -> column_view {
cudf::detail::aggregation_dispatcher(other_agg->kind, *this, *other_agg);
- auto result = cache.get_result(values, *other_agg);
- udf_input.emplace(std::move(other_agg), std::move(result));
- }
+ return cache.get_result(values, *other_agg);
+ };
}
- auto output = (*udf_ptr)(udf_input, stream, mr);
- CUDF_EXPECTS(std::holds_alternative<std::unique_ptr<column>>(output),
- "Invalid output type from HOST_UDF groupby aggregation.");
- cache.add_result(values, agg, std::get<std::unique_ptr<column>>(std::move(output)));
+ cache.add_result(values, agg, (*udf_ptr)(stream, mr));
}
} // namespace detail
diff --git a/cpp/src/groupby/sort/host_udf_aggregation.cpp b/cpp/src/groupby/sort/host_udf_aggregation.cpp
index 0da47e17f48..6f1fe80c4bd 100644
--- a/cpp/src/groupby/sort/host_udf_aggregation.cpp
+++ b/cpp/src/groupby/sort/host_udf_aggregation.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,51 +16,9 @@
#include
#include
-#include
namespace cudf {
-host_udf_base::data_attribute::data_attribute(data_attribute const& other)
- : value{std::visit(cudf::detail::visitor_overload{[](auto const& val) { return value_type{val}; },
- [](std::unique_ptr<aggregation> const& val) {
- return value_type{val->clone()};
- }},
- other.value)}
-{
-}
-
-std::size_t host_udf_base::data_attribute::hash::operator()(data_attribute const& attr) const
-{
- auto const hash_value =
- std::visit(cudf::detail::visitor_overload{
- [](auto const& val) { return std::hash<int>{}(static_cast<int>(val)); },
- [](std::unique_ptr<aggregation> const& val) { return val->do_hash(); }},
- attr.value);
- return std::hash<std::size_t>{}(attr.value.index()) ^ hash_value;
-}
-
-bool host_udf_base::data_attribute::equal_to::operator()(data_attribute const& lhs,
- data_attribute const& rhs) const
-{
- auto const& lhs_val = lhs.value;
- auto const& rhs_val = rhs.value;
- if (lhs_val.index() != rhs_val.index()) { return false; }
- return std::visit(
- cudf::detail::visitor_overload{
- [](auto const& lhs_val, auto const& rhs_val) {
- if constexpr (std::is_same_v<decltype(lhs_val), decltype(rhs_val)>) {
- return lhs_val == rhs_val;
- } else {
- return false;
- }
- },
- [](std::unique_ptr<aggregation> const& lhs_val, std::unique_ptr<aggregation> const& rhs_val) {
- return lhs_val->is_equal(*rhs_val);
- }},
- lhs_val,
- rhs_val);
-}
-
namespace detail {
host_udf_aggregation::host_udf_aggregation(std::unique_ptr<host_udf_base> udf_ptr_)
@@ -99,5 +57,9 @@ template CUDF_EXPORT std::unique_ptr<aggregation> make_host_udf_aggregation<aggregation>(std::unique_ptr<host_udf_base>);
template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
make_host_udf_aggregation<groupby_aggregation>(std::unique_ptr<host_udf_base>);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+ make_host_udf_aggregation<reduce_aggregation>(std::unique_ptr<host_udf_base>);
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
+ make_host_udf_aggregation<segmented_reduce_aggregation>(std::unique_ptr<host_udf_base>);
} // namespace cudf
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 82d8152ca1c..113342e9cbf 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -30,6 +30,7 @@
#include
#include
+#include <cudf/detail/utilities/stream_pool.hpp>
#include
#include
@@ -37,12 +38,25 @@
#include
#include
+#include <BS_thread_pool.hpp>
+#include <BS_thread_pool_utils.hpp>
+
#include
namespace cudf::io::json::detail {
namespace {
+namespace pools {
+
+BS::thread_pool& tpool()
+{
+ static BS::thread_pool _tpool(std::thread::hardware_concurrency());
+ return _tpool;
+}
+
+} // namespace pools
+
class compressed_host_buffer_source final : public datasource {
public:
explicit compressed_host_buffer_source(std::unique_ptr const& src,
@@ -51,8 +65,8 @@ class compressed_host_buffer_source final : public datasource {
{
auto ch_buffer = host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(_dbuf_ptr->data()),
_dbuf_ptr->size());
- if (comptype == compression_type::GZIP || comptype == compression_type::ZIP ||
- comptype == compression_type::SNAPPY) {
+ if (_comptype == compression_type::GZIP || _comptype == compression_type::ZIP ||
+ _comptype == compression_type::SNAPPY) {
_decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer);
} else {
_decompressed_buffer = cudf::io::detail::decompress(_comptype, ch_buffer);
@@ -96,7 +110,22 @@ class compressed_host_buffer_source final : public datasource {
return std::make_unique<non_owning_buffer>(_decompressed_buffer.data() + offset, count);
}
- [[nodiscard]] bool supports_device_read() const override { return false; }
+ std::future<size_t> device_read_async(size_t offset,
+ size_t size,
+ uint8_t* dst,
+ rmm::cuda_stream_view stream) override
+ {
+ auto& thread_pool = pools::tpool();
+ return thread_pool.submit_task([this, offset, size, dst, stream] {
+ auto hbuf = host_read(offset, size);
+ CUDF_CUDA_TRY(
+ cudaMemcpyAsync(dst, hbuf->data(), hbuf->size(), cudaMemcpyHostToDevice, stream.value()));
+ stream.synchronize();
+ return hbuf->size();
+ });
+ }
+
+ [[nodiscard]] bool supports_device_read() const override { return true; }
[[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; }
@@ -431,6 +460,8 @@ device_span ingest_raw_input(device_span buffer,
// line of file i+1 don't end up on the same JSON line, if file i does not already end with a line
// delimiter.
auto constexpr num_delimiter_chars = 1;
+ std::vector<std::future<size_t>> thread_tasks;
+ auto stream_pool = cudf::detail::fork_streams(stream, pools::tpool().get_thread_count());
auto delimiter_map = cudf::detail::make_empty_host_vector<std::size_t>(sources.size(), stream);
std::vector<std::size_t> prefsum_source_sizes(sources.size());
@@ -447,13 +478,17 @@ device_span ingest_raw_input(device_span buffer,
auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset);
range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;
- for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) {
+ for (std::size_t i = start_source, cur_stream = 0;
+ i < sources.size() && bytes_read < total_bytes_to_read;
+ i++) {
if (sources[i]->is_empty()) continue;
auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read);
auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read +
(num_delimiter_chars * delimiter_map.size());
- if (sources[i]->is_device_read_preferred(data_size)) {
- bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
+ if (sources[i]->supports_device_read()) {
+ thread_tasks.emplace_back(sources[i]->device_read_async(
+ range_offset, data_size, destination, stream_pool[cur_stream++ % stream_pool.size()]));
+ bytes_read += data_size;
} else {
h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
auto const& h_buffer = h_buffers.back();
@@ -481,6 +516,15 @@ device_span ingest_raw_input(device_span buffer,
buffer.data());
}
stream.synchronize();
+
+ if (thread_tasks.size()) {
+ auto const bytes_read = std::accumulate(
+ thread_tasks.begin(), thread_tasks.end(), std::size_t{0}, [](std::size_t sum, auto& task) {
+ return sum + task.get();
+ });
+ CUDF_EXPECTS(bytes_read == total_bytes_to_read, "something's fishy");
+ }
+
return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars));
}
@@ -505,10 +549,17 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
return read_json_impl(sources, reader_opts, stream, mr);
std::vector<std::unique_ptr<datasource>> compressed_sources;
- for (size_t i = 0; i < sources.size(); i++) {
- compressed_sources.emplace_back(
- std::make_unique(sources[i], reader_opts.get_compression()));
+ std::vector<std::future<std::unique_ptr<datasource>>> thread_tasks;
+ auto& thread_pool = pools::tpool();
+ for (auto& src : sources) {
+ thread_tasks.emplace_back(thread_pool.submit_task([&reader_opts, &src] {
+ return std::make_unique(src, reader_opts.get_compression());
+ }));
}
+ std::transform(thread_tasks.begin(),
+ thread_tasks.end(),
+ std::back_inserter(compressed_sources),
+ [](auto& task) { return task.get(); });
// in read_json_impl, we need the compressed source size to actually be the
// uncompressed source size for correct batching
return read_json_impl(compressed_sources, reader_opts, stream, mr);
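
The same submit-and-join pattern appears twice above: one task per source is queued on
the shared pool, each task reports its result, and the caller blocks once at the end. A
condensed sketch of the idiom, with a hypothetical copy_one_source standing in for the
per-source work:

@code{.cpp}
std::vector<std::future<std::size_t>> tasks;
for (auto& src : sources) {
  // BS::thread_pool::submit_task returns a std::future of the lambda's result.
  tasks.emplace_back(pools::tpool().submit_task([&src] {
    return copy_one_source(src);  // hypothetical; returns bytes copied
  }));
}
// Join all tasks and total the bytes read (std::accumulate from <numeric>).
auto const total = std::accumulate(
  tasks.begin(), tasks.end(), std::size_t{0},
  [](std::size_t sum, auto& task) { return sum + task.get(); });
@endcode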
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index bed1d5500d2..dff1f3f0c0e 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
_stream);
}
- // Compute column string sizes (using page string offsets) for this subpass
+ // Compute column string sizes (using page string offsets) for this output table chunk
col_string_sizes = calculate_page_string_offsets();
- // ensure cumulative column string sizes have been initialized
- if (pass.cumulative_col_string_sizes.empty()) {
- pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0);
- }
-
- // Add to the cumulative column string sizes of this pass
- std::transform(pass.cumulative_col_string_sizes.begin(),
- pass.cumulative_col_string_sizes.end(),
- col_string_sizes.begin(),
- pass.cumulative_col_string_sizes.begin(),
- std::plus<>{});
-
// Check for overflow in cumulative column string sizes of this pass so that the page string
// offsets of overflowing (large) string columns are treated as 64-bit.
auto const threshold = static_cast<std::size_t>(strings::detail::get_offset64_threshold());
- auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(),
- pass.cumulative_col_string_sizes.cend(),
+ auto const has_large_strings = std::any_of(col_string_sizes.cbegin(),
+ col_string_sizes.cend(),
[=](std::size_t sz) { return sz > threshold; });
if (has_large_strings and not strings::detail::is_large_strings_enabled()) {
CUDF_FAIL("String column exceeds the column size limit", std::overflow_error);
}
- // Mark any chunks for which the cumulative column string size has exceeded the
- // large strings threshold
- if (has_large_strings) {
- for (auto& chunk : pass.chunks) {
- auto const idx = chunk.src_col_index;
- if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; }
- }
+ // Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output
+ // column chunks and the large strings threshold.
+ for (auto& chunk : pass.chunks) {
+ auto const idx = chunk.src_col_index;
+ chunk.is_large_string_col = (col_string_sizes[idx] > threshold);
}
}
@@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
// only do string buffer for leaf
if (idx == max_depth - 1 and out_buf.string_size() == 0 and
col_string_sizes[pass.chunks[c].src_col_index] > 0) {
- out_buf.create_string_data(
- col_string_sizes[pass.chunks[c].src_col_index],
- pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] >
- static_cast<std::size_t>(strings::detail::get_offset64_threshold()),
- _stream);
+ out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index],
+ pass.chunks[c].is_large_string_col,
+ _stream);
}
if (has_strings) { str_data[idx] = out_buf.string_data(); }
out_buf.user_data |=
@@ -450,11 +434,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
final_offsets.emplace_back(offset);
out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED;
} else if (out_buf.type.id() == type_id::STRING) {
- // need to cap off the string offsets column
- auto const sz = static_cast<int64_t>(col_string_sizes[idx]);
- if (sz <= strings::detail::get_offset64_threshold()) {
+ // only if it is not a large strings column
+ if (col_string_sizes[idx] <=
+ static_cast<std::size_t>(strings::detail::get_offset64_threshold())) {
out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size);
- final_offsets.emplace_back(sz);
+ final_offsets.emplace_back(static_cast<size_type>(col_string_sizes[idx]));
}
}
}
diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp
index ca46f198bb8..4a773fbced1 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.hpp
+++ b/cpp/src/io/parquet/reader_impl_chunking.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -130,9 +130,6 @@ struct pass_intermediate_data {
rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()};
rmm::device_uvector<string_index_pair> str_dict_index{0, cudf::get_default_stream()};
- // cumulative strings column sizes.
- std::vector<std::size_t> cumulative_col_string_sizes{};
-
int level_type_size{0};
// skip_rows / num_rows for this pass.
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 75e45a68842..9833dab282e 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -171,7 +171,10 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag)
}
// Converts character to lowercase.
-constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; }
+CUDF_HOST_DEVICE constexpr char to_lower(char const c)
+{
+ return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c;
+}
/**
* @brief Checks if string is infinity, case insensitive with/without sign
@@ -515,13 +518,13 @@ struct ConvertFunctor {
template <typename T,
CUDF_ENABLE_IF(std::is_integral_v<T> and !std::is_same_v<T, bool> and
!cudf::is_fixed_point<T>())>
- __host__ __device__ __forceinline__ bool operator()(char const* begin,
- char const* end,
- void* out_buffer,
- size_t row,
- data_type const output_type,
- parse_options_view const& opts,
- bool as_hex = false)
+ __device__ __forceinline__ bool operator()(char const* begin,
+ char const* end,
+ void* out_buffer,
+ size_t row,
+ data_type const output_type,
+ parse_options_view const& opts,
+ bool as_hex = false)
{
auto const value = [as_hex, &opts, begin, end]() -> cuda::std::optional<T> {
// Check for user-specified true/false values
@@ -564,13 +567,13 @@ struct ConvertFunctor {
* @brief Dispatch for boolean type types.
*/
template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, bool>)>
- __host__ __device__ __forceinline__ bool operator()(char const* begin,
- char const* end,
- void* out_buffer,
- size_t row,
- data_type const output_type,
- parse_options_view const& opts,
- bool as_hex)
+ __device__ __forceinline__ bool operator()(char const* begin,
+ char const* end,
+ void* out_buffer,
+ size_t row,
+ data_type const output_type,
+ parse_options_view const& opts,
+ bool as_hex)
{
auto const value = [&opts, begin, end]() -> cuda::std::optional<T> {
// Check for user-specified true/false values
@@ -593,13 +596,13 @@ struct ConvertFunctor {
* is not valid. In such case, the validity mask is set to zero too.
*/
template <typename T, CUDF_ENABLE_IF(std::is_floating_point_v<T>)>
- __host__ __device__ __forceinline__ bool operator()(char const* begin,
- char const* end,
- void* out_buffer,
- size_t row,
- data_type const output_type,
- parse_options_view const& opts,
- bool as_hex)
+ __device__ __forceinline__ bool operator()(char const* begin,
+ char const* end,
+ void* out_buffer,
+ size_t row,
+ data_type const output_type,
+ parse_options_view const& opts,
+ bool as_hex)
{
auto const value = [&opts, begin, end]() -> cuda::std::optional<T> {
// Check for user-specified true/false values
diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh
index c0efc5b6f20..dbdc4a34277 100644
--- a/cpp/src/io/utilities/trie.cuh
+++ b/cpp/src/io/utilities/trie.cuh
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -74,16 +74,14 @@ CUDF_EXPORT trie create_serialized_trie(std::vector<std::string> const& keys,
/*
* @brief Searches for a string in a serialized trie.
*
- * Can be executed on host or device, as long as the data is available
- *
* @param trie Pointer to the array of nodes that make up the trie
* @param key Pointer to the start of the string to find
* @param key_len Length of the string to find
*
* @return Boolean value; true if string is found, false otherwise
*/
-CUDF_HOST_DEVICE inline bool serialized_trie_contains(device_span<serial_trie_node const> trie,
- device_span<char const> key)
+__device__ inline bool serialized_trie_contains(device_span<serial_trie_node const> trie,
+ device_span<char const> key)
{
if (trie.empty()) { return false; }
if (key.empty()) { return trie.front().is_leaf; }
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index 75ebc078930..928625a7e8f 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
* limitations under the License.
*/
+#include <cudf/aggregation/host_udf.hpp>
#include
#include
#include
@@ -144,6 +145,13 @@ struct reduce_dispatch_functor {
auto td_agg = static_cast<cudf::detail::merge_tdigest_aggregation const&>(agg);
return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr);
}
+ case aggregation::HOST_UDF: {
+ auto const& udf_base_ptr =
+ dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
+ auto const udf_ptr = dynamic_cast<reduce_host_udf const*>(udf_base_ptr.get());
+ CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for reduction.");
+ return (*udf_ptr)(col, output_dtype, init, stream, mr);
+ } // case aggregation::HOST_UDF
default: CUDF_FAIL("Unsupported reduction operator");
}
}
@@ -161,9 +169,11 @@ std::unique_ptr reduce(column_view const& col,
cudf::data_type_error);
if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT ||
agg.kind == aggregation::MIN || agg.kind == aggregation::MAX ||
- agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) {
+ agg.kind == aggregation::ANY || agg.kind == aggregation::ALL ||
+ agg.kind == aggregation::HOST_UDF)) {
CUDF_FAIL(
- "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types");
+ "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF "
+ "aggregation types");
}
// Returns default scalar if input column is empty or all null
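
With the host_udf.hpp changes above, a reduction call site looks like this sketch;
my_reduce_udf is a hypothetical reduce_host_udf implementation and input_column is any
column_view the UDF understands:

@code{.cpp}
auto const agg = cudf::make_host_udf_aggregation<cudf::reduce_aggregation>(
  std::make_unique<my_reduce_udf>());
// An initial value is now accepted for HOST_UDF, as for SUM/PRODUCT/MIN/MAX/ANY/ALL.
auto const init   = cudf::make_fixed_width_scalar<double>(3.0);
auto const result = cudf::reduce(
  input_column, *agg, cudf::data_type{cudf::type_id::FLOAT64}, *init);
@endcode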
diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
index 1c3a2b0c0f3..5835bfcf0a1 100644
--- a/cpp/src/reductions/segmented/reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,6 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
+#include <cudf/aggregation/host_udf.hpp>
#include
#include
#include
@@ -98,6 +100,13 @@ struct segmented_reduce_dispatch_functor {
}
case segmented_reduce_aggregation::NUNIQUE:
return segmented_nunique(col, offsets, null_handling, stream, mr);
+ case aggregation::HOST_UDF: {
+ auto const& udf_base_ptr =
+ dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
+ auto const udf_ptr = dynamic_cast<segmented_reduce_host_udf const*>(udf_base_ptr.get());
+ CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for segmented reduction.");
+ return (*udf_ptr)(col, offsets, output_dtype, null_handling, init, stream, mr);
+ } // case aggregation::HOST_UDF
default: CUDF_FAIL("Unsupported aggregation type.");
}
}
@@ -117,9 +126,11 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values,
cudf::data_type_error);
if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT ||
agg.kind == aggregation::MIN || agg.kind == aggregation::MAX ||
- agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) {
+ agg.kind == aggregation::ANY || agg.kind == aggregation::ALL ||
+ agg.kind == aggregation::HOST_UDF)) {
CUDF_FAIL(
- "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types");
+ "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF "
+ "aggregation types");
}
if (segmented_values.is_empty() && offsets.empty()) {
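
The segmented counterpart, as a sketch; my_segmented_udf is a hypothetical
segmented_reduce_host_udf implementation, values is the segmented input column, and
offsets is a device_span of size_type offsets delimiting the segments:

@code{.cpp}
auto const agg = cudf::make_host_udf_aggregation<cudf::segmented_reduce_aggregation>(
  std::make_unique<my_segmented_udf>());
auto const result = cudf::segmented_reduce(values,
                                           offsets,
                                           *agg,
                                           cudf::data_type{cudf::type_id::FLOAT64},
                                           cudf::null_policy::INCLUDE);
@endcode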
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 997b0278fe2..33d52ccd570 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@
#include
#include
-#include
+#include
#include
#include
@@ -315,6 +315,7 @@ namespace {
*/
CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings,
cudf::size_type ngrams,
+ uint32_t seed,
cudf::size_type const* d_ngram_offsets,
cudf::hash_value_type* d_results)
{
@@ -332,7 +333,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st
__shared__ cudf::hash_value_type hvs[block_size]; // temp store for hash values
auto const ngram_offset = d_ngram_offsets[str_idx];
- auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0};
+ auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{seed};
auto const end = d_str.data() + d_str.size_bytes();
auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1;
@@ -368,6 +369,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st
std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& input,
cudf::size_type ngrams,
+ uint32_t seed,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
@@ -400,7 +402,7 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
auto d_hashes = hashes->mutable_view().data<cudf::hash_value_type>();
character_ngram_hash_kernel<<<grid.num_blocks, block_size, 0, stream.value()>>>(
- *d_strings, ngrams, d_offsets, d_hashes);
+ *d_strings, ngrams, seed, d_offsets, d_hashes);
return make_lists_column(
input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr);
@@ -419,11 +421,12 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie
std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
cudf::size_type ngrams,
+ uint32_t seed,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();
- return detail::hash_character_ngrams(strings, ngrams, stream, mr);
+ return detail::hash_character_ngrams(strings, ngrams, seed, stream, mr);
}
} // namespace nvtext
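
A short sketch of the effect of the new `seed` parameter, assuming `input` is a hypothetical cudf::strings_column_view: the seed feeds MurmurHash3_x86_32, so different seeds yield independent hash families over the same ngrams (useful, e.g., for MinHash-style signatures):

  #include <nvtext/generate_ngrams.hpp>

  auto hashes_a = nvtext::hash_character_ngrams(input, 5, 0);   // seed 0 reproduces prior behavior
  auto hashes_b = nvtext::hash_character_ngrams(input, 5, 42);  // caller-chosen seed
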
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 344979e1288..35877ac34b9 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -220,11 +220,12 @@ ConfigureTest(
REDUCTIONS_TEST
reductions/collect_ops_tests.cpp
reductions/ewm_tests.cpp
+ reductions/host_udf_example_tests.cu
+ reductions/list_rank_test.cpp
reductions/rank_tests.cpp
reductions/reduction_tests.cpp
reductions/scan_tests.cpp
reductions/segmented_reduction_tests.cpp
- reductions/list_rank_test.cpp
reductions/tdigest_tests.cu
GPUS 1
PERCENT 70
diff --git a/cpp/tests/groupby/host_udf_example_tests.cu b/cpp/tests/groupby/host_udf_example_tests.cu
index a454bd692fc..e1ded37d8a7 100644
--- a/cpp/tests/groupby/host_udf_example_tests.cu
+++ b/cpp/tests/groupby/host_udf_example_tests.cu
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,9 +21,7 @@
#include
#include
#include
-#include
#include
-#include
#include
#include
@@ -34,6 +32,9 @@
#include
#include
+using doubles_col = cudf::test::fixed_width_column_wrapper<double>;
+using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
+
namespace {
/**
* @brief A host-based UDF implementation for groupby.
@@ -41,42 +42,21 @@ namespace {
* For each group of values, the aggregation computes
* `(group_idx + 1) * group_sum_of_squares - group_max * group_sum`.
*/
-struct host_udf_groupby_example : cudf::host_udf_base {
+struct host_udf_groupby_example : cudf::groupby_host_udf {
host_udf_groupby_example() = default;
- [[nodiscard]] data_attribute_set_t get_required_data() const override
- {
- // We need grouped values, group offsets, group labels, and also results from groups'
- // MAX and SUM aggregations.
- return {groupby_data_attribute::GROUPED_VALUES,
- groupby_data_attribute::GROUP_OFFSETS,
- groupby_data_attribute::GROUP_LABELS,
- cudf::make_max_aggregation<cudf::groupby_aggregation>(),
- cudf::make_sum_aggregation<cudf::groupby_aggregation>()};
- }
-
- [[nodiscard]] output_t get_empty_output(
- [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
- [[maybe_unused]] rmm::cuda_stream_view stream,
- [[maybe_unused]] rmm::device_async_resource_ref mr) const override
+ [[nodiscard]] std::unique_ptr<cudf::column> get_empty_output(
+ rmm::cuda_stream_view, rmm::device_async_resource_ref) const override
{
return cudf::make_empty_column(
cudf::data_type{cudf::type_to_id<double>()});
}
- [[nodiscard]] output_t operator()(input_map_t const& input,
- rmm::cuda_stream_view stream,
- rmm::device_async_resource_ref mr) const override
+ [[nodiscard]] std::unique_ptr<cudf::column> operator()(
+ rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override
{
- auto const& values =
- std::get<cudf::column_view>(input.at(groupby_data_attribute::GROUPED_VALUES));
- return cudf::type_dispatcher(values.type(), groupby_fn{this}, input, stream, mr);
- }
-
- [[nodiscard]] std::size_t do_hash() const override
- {
- // Just return the same hash for all instances of this class.
- return std::size_t{12345};
+ auto const values = get_grouped_values();
+ return cudf::type_dispatcher(values.type(), groupby_fn{*this}, stream, mr);
}
[[nodiscard]] bool is_equal(host_udf_base const& other) const override
@@ -92,37 +72,33 @@ struct host_udf_groupby_example : cudf::host_udf_base {
struct groupby_fn {
// Store pointer to the parent class so we can call its functions.
- host_udf_groupby_example const* parent;
+ host_udf_groupby_example const& parent;
- // For simplicity, this example only accepts double input and always produces double output.
+ // For simplicity, this example only accepts a single input/output type: double.
using InputType = double;
using OutputType = double;
template <typename T, typename... Args, CUDF_ENABLE_IF(!std::is_same_v<T, InputType>)>
- output_t operator()(Args...) const
+ std::unique_ptr<cudf::column> operator()(Args...) const
{
CUDF_FAIL("Unsupported input type.");
}
template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, InputType>)>
- output_t operator()(input_map_t const& input,
- rmm::cuda_stream_view stream,
- rmm::device_async_resource_ref mr) const
+ std::unique_ptr<cudf::column> operator()(rmm::cuda_stream_view stream,
+ rmm::device_async_resource_ref mr) const
{
- auto const& values =
- std::get<cudf::column_view>(input.at(groupby_data_attribute::GROUPED_VALUES));
- if (values.size() == 0) { return parent->get_empty_output(std::nullopt, stream, mr); }
+ auto const values = parent.get_grouped_values();
+ if (values.size() == 0) { return parent.get_empty_output(stream, mr); }
- auto const offsets = std::get<cudf::device_span<cudf::size_type const>>(
- input.at(groupby_data_attribute::GROUP_OFFSETS));
+ auto const offsets = parent.get_group_offsets();
CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets.");
auto const num_groups = static_cast<int>(offsets.size()) - 1;
- auto const group_indices = std::get<cudf::device_span<cudf::size_type const>>(
- input.at(groupby_data_attribute::GROUP_LABELS));
- auto const group_max = std::get<cudf::column_view>(
- input.at(cudf::make_max_aggregation<cudf::groupby_aggregation>()));
- auto const group_sum = std::get<cudf::column_view>(
- input.at(cudf::make_sum_aggregation<cudf::groupby_aggregation>()));
+ auto const group_indices = parent.get_group_labels();
+ auto const group_max =
+ parent.compute_aggregation(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+ auto const group_sum =
+ parent.compute_aggregation(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
auto const values_dv_ptr = cudf::column_device_view::create(values, stream);
auto const output = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<OutputType>()},
@@ -191,9 +167,6 @@ struct host_udf_groupby_example : cudf::host_udf_base {
} // namespace
-using doubles_col = cudf::test::fixed_width_column_wrapper<double>;
-using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
-
struct HostUDFGroupbyExampleTest : cudf::test::BaseFixture {};
TEST_F(HostUDFGroupbyExampleTest, SimpleInput)
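
For orientation, a hedged sketch of how a groupby host UDF such as `host_udf_groupby_example` is wired into an aggregation request, mirroring the fixture pattern in these tests (`keys` and `vals` are hypothetical columns):

  std::vector<cudf::groupby::aggregation_request> requests;
  requests.emplace_back();
  requests[0].values = vals;
  requests[0].aggregations.push_back(cudf::make_host_udf_aggregation<cudf::groupby_aggregation>(
    std::make_unique<host_udf_groupby_example>()));
  cudf::groupby::groupby gb_obj(cudf::table_view({keys}));
  auto const result = gb_obj.aggregate(requests);  // result.second[0].results[0] holds the UDF output
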
diff --git a/cpp/tests/groupby/host_udf_tests.cpp b/cpp/tests/groupby/host_udf_tests.cpp
index 1a0f68c0c6c..17da28cdefc 100644
--- a/cpp/tests/groupby/host_udf_tests.cpp
+++ b/cpp/tests/groupby/host_udf_tests.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -26,178 +26,121 @@
#include
namespace {
+
/**
- * @brief A host-based UDF implementation used for unit tests.
+ * @brief Generate a random aggregation object from {min, max, sum, product}.
*/
-struct host_udf_test_base : cudf::host_udf_base {
+std::unique_ptr<cudf::groupby_aggregation> get_random_agg()
+{
+ std::random_device rd;
+ std::mt19937 gen(rd());
+ std::uniform_int_distribution<int> distr(1, 4);
+ switch (distr(gen)) {
+ case 1: return cudf::make_min_aggregation<cudf::groupby_aggregation>();
+ case 2: return cudf::make_max_aggregation<cudf::groupby_aggregation>();
+ case 3: return cudf::make_sum_aggregation<cudf::groupby_aggregation>();
+ case 4: return cudf::make_product_aggregation<cudf::groupby_aggregation>();
+ default: CUDF_UNREACHABLE("This should not be reached.");
+ }
+ return nullptr;
+}
+
+/**
+ * @brief A host-based UDF implementation used for unit tests for groupby aggregation.
+ */
+struct host_udf_groupby_test : cudf::groupby_host_udf {
int test_location_line; // the location where testing is called
bool* test_run; // to check if the test is accidentally skipped
- data_attribute_set_t input_attrs;
+ bool test_other_agg; // test calling other aggregation
- host_udf_test_base(int test_location_line_, bool* test_run_, data_attribute_set_t input_attrs_)
- : test_location_line{test_location_line_},
- test_run{test_run_},
- input_attrs(std::move(input_attrs_))
+ host_udf_groupby_test(int test_location_line_, bool* test_run_, bool test_other_agg_)
+ : test_location_line{test_location_line_}, test_run{test_run_}, test_other_agg{test_other_agg_}
{
}
- [[nodiscard]] data_attribute_set_t get_required_data() const override { return input_attrs; }
-
- // This is the main testing function, which checks for the correctness of input data.
- // The rests are just to satisfy the interface.
- [[nodiscard]] output_t operator()(input_map_t const& input,
- rmm::cuda_stream_view stream,
- rmm::device_async_resource_ref mr) const override
+ [[nodiscard]] std::size_t do_hash() const override { return 0; }
+ [[nodiscard]] bool is_equal(host_udf_base const& other) const override
{
- SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line));
-
- test_data_attributes(input, stream, mr);
-
- *test_run = true; // test is run successfully
- return get_empty_output(std::nullopt, stream, mr);
+ // Just check if the other object is also instance of this class.
+ return dynamic_cast(&other) != nullptr;
+ }
+ [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+ {
+ return std::make_unique<host_udf_groupby_test>(test_location_line, test_run, test_other_agg);
}
- [[nodiscard]] output_t get_empty_output(
- [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
+ [[nodiscard]] std::unique_ptr<cudf::column> get_empty_output(
[[maybe_unused]] rmm::cuda_stream_view stream,
[[maybe_unused]] rmm::device_async_resource_ref mr) const override
{
- // Unused function - dummy output.
+ // Dummy output.
return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
}
- [[nodiscard]] std::size_t do_hash() const override { return 0; }
- [[nodiscard]] bool is_equal(host_udf_base const& other) const override { return true; }
+ [[nodiscard]] std::unique_ptr<cudf::column> operator()(
+ rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override
+ {
+ SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line));
- // The main test function, which must be implemented for each kind of aggregations
- // (groupby/reduction/segmented_reduction).
- virtual void test_data_attributes(input_map_t const& input,
- rmm::cuda_stream_view stream,
- rmm::device_async_resource_ref mr) const = 0;
-};
+ // Perform type checks on the groupby data: we must ensure that the data corresponding to
+ // each `groupby_data` enum value has the correct type.
-/**
- * @brief A host-based UDF implementation used for unit tests for groupby aggregation.
- */
-struct host_udf_groupby_test : host_udf_test_base {
- host_udf_groupby_test(int test_location_line_,
- bool* test_run_,
- data_attribute_set_t input_attrs_ = {})
- : host_udf_test_base(test_location_line_, test_run_, std::move(input_attrs_))
- {
- }
+ {
+ auto const inp_data = get_input_values();
+ EXPECT_TRUE((std::is_same_v<cudf::column_view, std::decay_t<decltype(inp_data)>>));
+ }
- [[nodiscard]] std::unique_ptr