diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 9dabe4e8800..354560998c5 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -336,6 +336,7 @@ add_library(
   src/aggregation/result_cache.cpp
   src/ast/expression_parser.cpp
   src/ast/expressions.cpp
+  src/ast/operators.cpp
   src/binaryop/binaryop.cpp
   src/binaryop/compiled/ATan2.cu
   src/binaryop/compiled/Add.cu
@@ -477,13 +478,13 @@ add_library(
   src/io/avro/reader_impl.cu
   src/io/comp/brotli_dict.cpp
   src/io/comp/comp.cpp
+  src/io/comp/comp.cu
   src/io/comp/cpu_unbz2.cpp
   src/io/comp/debrotli.cu
   src/io/comp/gpuinflate.cu
   src/io/comp/nvcomp_adapter.cpp
   src/io/comp/nvcomp_adapter.cu
   src/io/comp/snap.cu
-  src/io/comp/statistics.cu
   src/io/comp/uncomp.cpp
   src/io/comp/unsnap.cu
   src/io/csv/csv_gpu.cu
@@ -515,6 +516,7 @@ add_library(
   src/datetime/timezone.cpp
   src/io/orc/writer_impl.cu
   src/io/parquet/arrow_schema_writer.cpp
+  src/io/parquet/bloom_filter_reader.cu
   src/io/parquet/compact_protocol_reader.cpp
   src/io/parquet/compact_protocol_writer.cpp
   src/io/parquet/decode_preprocess.cu
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index a1b7db5e08a..2b2a660bed7 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -601,7 +601,7 @@ std::unique_ptr<Base> make_udf_aggregation(udf_type type,
                                            data_type output_type);
 
 // Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation.
-struct host_udf_base;
+class host_udf_base;
 
 /**
  * @brief Factory to create a HOST_UDF aggregation.
diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp
index bbce76dc5f3..451d75137e4 100644
--- a/cpp/include/cudf/aggregation/host_udf.hpp
+++ b/cpp/include/cudf/aggregation/host_udf.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,18 +17,16 @@
 #pragma once
 
 #include <cudf/aggregation.hpp>
+#include <cudf/column/column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
-#include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <functional>
 #include <optional>
-#include <unordered_map>
-#include <unordered_set>
-#include <variant>
 
 /**
  * @file host_udf.hpp
@@ -43,49 +41,141 @@ namespace CUDF_EXPORT cudf {
  */
 
 /**
- * @brief The interface for host-based UDF implementation.
+ * @brief The fundamental interface for host-based UDF implementation.
  *
- * An implementation of host-based UDF needs to be derived from this base class, defining
- * its own version of the required functions. In particular:
- *  - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`,
- *    and `clone` functions.
- *  - If necessary, the derived class can also override `do_hash` to compute hashing for its
- *    instance, and `get_required_data` to selectively access to the input data as well as
- *    intermediate data provided by libcudf.
+ * This class declares the functions `do_hash`, `is_equal`, and `clone` that must be defined in
+ * the users' UDF implementation. These functions are required for libcudf aggregation framework
+ * to perform its operations.
+ */
+class host_udf_base {
+  // Declare constructor private to prevent the users from deriving from this class.
+ private:
+  host_udf_base() = default;  ///< Default constructor
+
+  // Only allow deriving from the structs below.
+  friend struct reduce_host_udf;
+  friend struct segmented_reduce_host_udf;
+  friend struct groupby_host_udf;
+
+ public:
+  virtual ~host_udf_base() = default;  ///< Default destructor
+
+  /**
+   * @brief Computes hash value of the instance.
+   *
+   * Overriding this function is optional when the derived class has data members such that
+   * each instance needs to be differentiated from each other.
+   *
+   * @return The hash value of the instance
+   */
+  [[nodiscard]] virtual std::size_t do_hash() const
+  {
+    return std::hash<int>{}(static_cast<int>(aggregation::Kind::HOST_UDF));
+  }
+
+  /**
+   * @brief Compares two instances of the derived class for equality.
+   * @param other The other instance to compare with
+   * @return True if the two instances are equal
+   */
+  [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0;
+
+  /**
+   * @brief Clones the instance.
+   *
+   * The instances of the derived class should be lightweight for efficient cloning.
+   *
+   * @return A new instance cloned from this one
+   */
+  [[nodiscard]] virtual std::unique_ptr<host_udf_base> clone() const = 0;
+};
+
+/**
+ * @brief The interface for host-based UDF implementation for reduction contexts.
+ *
+ * An implementation of host-based UDF for reduction needs to be derived from this class.
+ * In addition to implementing the virtual functions declared in the base class `host_udf_base`,
+ * such derived classes must also define the `operator()` function to perform reduction
+ * operations.
  *
- * Example of such implementation:
+ * Example:
  * @code{.cpp}
- * struct my_udf_aggregation : cudf::host_udf_base {
+ * struct my_udf_aggregation : cudf::reduce_host_udf {
  *   my_udf_aggregation() = default;
  *
- *   // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`,
- *   // and the result from groupby `MAX` aggregation.
- *   [[nodiscard]] data_attribute_set_t get_required_data() const override
+ *   [[nodiscard]] std::unique_ptr<scalar> operator()(
+ *     column_view const& input,
+ *     data_type output_dtype,
+ *     std::optional<std::reference_wrapper<scalar const>> init,
+ *     rmm::cuda_stream_view stream,
+ *     rmm::device_async_resource_ref mr) const override
  *   {
- *       return {groupby_data_attribute::GROUPED_VALUES,
- *               groupby_data_attribute::GROUP_OFFSETS,
- *               cudf::make_max_aggregation<cudf::groupby_aggregation>()};
+ *     // Perform reduction computation using the input data and return the reduction result.
+ *     // This is where the actual reduction logic is implemented.
  *   }
  *
- *   [[nodiscard]] output_t get_empty_output(
- *     [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
- *     [[maybe_unused]] rmm::cuda_stream_view stream,
- *     [[maybe_unused]] rmm::device_async_resource_ref mr) const override
+ *   [[nodiscard]] bool is_equal(host_udf_base const& other) const override
  *   {
- *     // This UDF aggregation always returns a column of type INT32.
- *     return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
+ *     // Check if the other object is also instance of this class.
+ *     // If there are internal state variables, they may need to be checked for equality as well.
+ *     return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr;
  *   }
  *
- *   [[nodiscard]] output_t operator()(input_map_t const& input,
- *                                     rmm::cuda_stream_view stream,
- *                                     rmm::device_async_resource_ref mr) const override
+ *   [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
  *   {
- *     // Perform UDF computation using the input data and return the result.
+ *     return std::make_unique<my_udf_aggregation>();
+ *   }
+ * };
+ * @endcode
+ */
+struct reduce_host_udf : host_udf_base {
+  /**
+   * @brief Perform reduction operations.
+   *
+   * @param input The input column for reduction
+   * @param output_dtype The data type for the final output scalar
+   * @param init The initial value of the reduction
+   * @param stream The CUDA stream to use for any kernel launches
+   * @param mr Device memory resource to use for any allocations
+   * @return The output result of the aggregation
+   */
+  [[nodiscard]] virtual std::unique_ptr<scalar> operator()(
+    column_view const& input,
+    data_type output_dtype,
+    std::optional<std::reference_wrapper<scalar const>> init,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) const = 0;
+};
+
+/**
+ * @brief The interface for host-based UDF implementation for segmented reduction context.
+ *
+ * An implementation of host-based UDF for segmented reduction needs to be derived from this class.
+ * In addition to implementing the virtual functions declared in the base class `host_udf_base`,
+ * such derived class must also define the `operator()` function to perform segmented reduction.
+ *
+ * Example:
+ * @code{.cpp}
+ * struct my_udf_aggregation : cudf::segmented_reduce_host_udf {
+ *   my_udf_aggregation() = default;
+ *
+ *   [[nodiscard]] std::unique_ptr<column> operator()(
+ *     column_view const& input,
+ *     device_span<size_type const> offsets,
+ *     data_type output_dtype,
+ *     null_policy null_handling,
+ *     std::optional<std::reference_wrapper<scalar const>> init,
+ *     rmm::cuda_stream_view stream,
+ *     rmm::device_async_resource_ref mr) const override
+ *   {
+ *     // Perform computation using the input data and return the result.
+ *     // This is where the actual segmented reduction logic is implemented.
  *   }
  *
  *   [[nodiscard]] bool is_equal(host_udf_base const& other) const override
  *   {
  *     // Check if the other object is also instance of this class.
+ *     // If there are internal state variables, they may need to be checked for equality as well.
  *     return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr;
  *   }
  *
@@ -96,198 +186,232 @@ namespace CUDF_EXPORT cudf {
  * };
  * @endcode
  */
-struct host_udf_base {
-  host_udf_base()          = default;
-  virtual ~host_udf_base() = default;
-
+struct segmented_reduce_host_udf : host_udf_base {
   /**
-   * @brief Define the possible data needed for groupby aggregations.
+   * @brief Perform segmented reduction operations.
    *
-   * Note that only sort-based groupby aggregations are supported.
+   * @param input The input column for reduction
+   * @param offsets A list of offsets defining the segments for reduction
+   * @param output_dtype The data type for the final output column
+   * @param null_handling If `INCLUDE` then the reduction result is valid only if all elements in
+   *        the segment are valid, and if `EXCLUDE` then the reduction result is valid if any
+   *        element in the segment is valid
+   * @param init The initial value of the reduction
+   * @param stream The CUDA stream to use for any kernel launches
+   * @param mr Device memory resource to use for any allocations
+   * @return The output result of the aggregation
    */
-  enum class groupby_data_attribute : int32_t {
-    INPUT_VALUES,    ///< The input values column.
-    GROUPED_VALUES,  ///< The input values grouped according to the input `keys` for which the
-                     ///< values within each group maintain their original order.
-    SORTED_GROUPED_VALUES,  ///< The input values grouped according to the input `keys` and
-                            ///< sorted within each group.
-    NUM_GROUPS,             ///< The number of groups (i.e., number of distinct keys).
-    GROUP_OFFSETS,          ///< The offsets separating groups.
-    GROUP_LABELS            ///< Group labels (which is also the same as group indices).
-  };
+  [[nodiscard]] virtual std::unique_ptr<column> operator()(
+    column_view const& input,
+    device_span<size_type const> offsets,
+    data_type output_dtype,
+    null_policy null_handling,
+    std::optional<std::reference_wrapper<scalar const>> init,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) const = 0;
+};
 
+// Forward declaration.
+namespace groupby ::detail {
+struct aggregate_result_functor;
+}
+
+/**
+ * @brief The interface for host-based UDF implementation for groupby aggregation context.
+ *
+ * An implementation of host-based UDF for groupby needs to be derived from this class.
+ * In addition to implementing the virtual functions declared in the base class `host_udf_base`,
+ * such a derived class must also define the functions `get_empty_output()` to return result when
+ * the input is empty, and ``operator()`` to perform its groupby operations.
+ *
+ * During execution, the derived class can access internal data provided by the libcudf groupby
+ * framework through a set of ``get*`` accessors, as well as calling other built-in groupby
+ * aggregations through the ``compute_aggregation`` function.
+ *
+ * @note The derived class can only perform sort-based groupby aggregations. Hash-based groupby
+ * aggregations require more complex data structure and is not yet supported.
+ *
+ * Example:
+ * @code{.cpp}
+ * struct my_udf_aggregation : cudf::groupby_host_udf {
+ *   my_udf_aggregation() = default;
+ *
+ *   [[nodiscard]] std::unique_ptr<column> get_empty_output(
+ *     rmm::cuda_stream_view stream,
+ *     rmm::device_async_resource_ref mr) const override
+ *   {
+ *     // Return a column corresponding to the result when the input values column is empty.
+ *   }
+ *
+ *   [[nodiscard]] std::unique_ptr<column> operator()(
+ *     rmm::cuda_stream_view stream,
+ *     rmm::device_async_resource_ref mr) const override
+ *   {
+ *     // Perform UDF computation using the input data and return the result.
+ *   }
+ *
+ *   [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+ *   {
+ *     // Check if the other object is also instance of this class.
+ *     // If there are internal state variables, they may need to be checked for equality as well.
+ *     return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr;
+ *   }
+ *
+ *   [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+ *   {
+ *     return std::make_unique<my_udf_aggregation>();
+ *   }
+ * };
+ * @endcode
+ */
+struct groupby_host_udf : host_udf_base {
   /**
-   * @brief Describe possible data that may be needed in the derived class for its operations.
+   * @brief Get the output when the input values column is empty.
    *
-   * Such data can be either intermediate data such as sorted values or group labels etc, or the
-   * results of other aggregations.
+   * This is called in libcudf when the input values column is empty. In such situations libcudf
+   * tries to generate the output directly without unnecessarily evaluating the intermediate data.
    *
-   * Each derived host-based UDF class may need a different set of data. It is inefficient to
-   * evaluate and pass down all these possible data at once from libcudf. A solution for that is,
-   * the derived class can define a subset of data that it needs and libcudf will evaluate
-   * and pass down only data requested from that set.
+   * @param stream The CUDA stream to use for any kernel launches
+   * @param mr Device memory resource to use for any allocations
+   * @return The output result of the aggregation when the input values column is empty
    */
-  struct data_attribute {
-    /**
-     * @brief Hold all possible data types for the input of the aggregation in the derived class.
-     */
-    using value_type = std::variant<groupby_data_attribute, std::unique_ptr<aggregation>>;
-    value_type value;  ///< The actual data attribute, wrapped by this struct
-                       ///< as a wrapper is needed to define `hash` and `equal_to` functors.
-
-    data_attribute()                 = default;  ///< Default constructor
-    data_attribute(data_attribute&&) = default;  ///< Move constructor
-
-    /**
-     * @brief Construct a new data attribute from an aggregation attribute.
-     * @param value_ An aggregation attribute
-     */
-    template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, groupby_data_attribute>)>
-    data_attribute(T value_) : value{value_}
-    {
-    }
-
-    /**
-     * @brief Construct a new data attribute from another aggregation request.
-     * @param value_ An aggregation request
-     */
-    template <typename T,
-              CUDF_ENABLE_IF(std::is_same_v<T, aggregation> ||
-                             std::is_same_v<T, groupby_aggregation>)>
-    data_attribute(std::unique_ptr<T> value_) : value{std::move(value_)}
-    {
-      CUDF_EXPECTS(std::get<std::unique_ptr<aggregation>>(value) != nullptr,
-                   "Invalid aggregation request.");
-      if constexpr (std::is_same_v<T, aggregation>) {
-        CUDF_EXPECTS(
-          dynamic_cast<groupby_aggregation*>(std::get<std::unique_ptr<T>>(value).get()) != nullptr,
-          "Requesting results from other aggregations is only supported in groupby "
-          "aggregations.");
-      }
-    }
-
-    /**
-     * @brief Copy constructor.
-     * @param other The other data attribute to copy from
-     */
-    data_attribute(data_attribute const& other);
-
-    /**
-     * @brief Hash functor for `data_attribute`.
-     */
-    struct hash {
-      /**
-       * @brief Compute the hash value of a data attribute.
-       * @param attr The data attribute to hash
-       * @return The hash value of the data attribute
-       */
-      std::size_t operator()(data_attribute const& attr) const;
-    };  // struct hash
-
-    /**
-     * @brief Equality comparison functor for `data_attribute`.
-     */
-    struct equal_to {
-      /**
-       * @brief Check if two data attributes are equal.
-       * @param lhs The left-hand side data attribute
-       * @param rhs The right-hand side data attribute
-       * @return True if the two data attributes are equal
-       */
-      bool operator()(data_attribute const& lhs, data_attribute const& rhs) const;
-    };  // struct equal_to
-  };    // struct data_attribute
+  [[nodiscard]] virtual std::unique_ptr<column> get_empty_output(
+    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0;
 
   /**
-   * @brief Set of attributes for the input data that is needed for computing the aggregation.
+   * @brief Perform the main groupby computation for the host-based UDF.
+   *
+   * @param stream The CUDA stream to use for any kernel launches
+   * @param mr Device memory resource to use for any allocations
+   * @return The output result of the aggregation
    */
-  using data_attribute_set_t =
-    std::unordered_set<data_attribute, data_attribute::hash, data_attribute::equal_to>;
+  [[nodiscard]] virtual std::unique_ptr<column> operator()(
+    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0;
+
+ private:
+  // Allow the struct `aggregate_result_functor` to set its private callback variables.
+  friend struct groupby::detail::aggregate_result_functor;
 
   /**
-   * @brief Return a set of attributes for the data that is needed for computing the aggregation.
-   *
-   * The derived class should return the attributes corresponding to only the data that it needs to
-   * avoid unnecessary computation performed in libcudf. If this function is not overridden, an
-   * empty set is returned. That means all the data attributes (except results from other
-   * aggregations in groupby) will be needed.
-   *
-   * @return A set of `data_attribute`
+   * @brief Callback to access the input values column.
+   */
+  std::function<column_view(void)> callback_input_values;
+
+  /**
+   * @brief Callback to access the input values grouped according to the input keys for which the
+   * values within each group maintain their original order.
+   */
+  std::function<column_view(void)> callback_grouped_values;
+
+  /**
+   * @brief Callback to access the input values grouped according to the input keys and sorted
+   * within each group.
+   */
+  std::function<column_view(void)> callback_sorted_grouped_values;
+
+  /**
+   * @brief Callback to access the number of groups (i.e., number of distinct keys).
    */
-  [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; }
+  std::function<size_type(void)> callback_num_groups;
 
   /**
-   * @brief Hold all possible types of the data that is passed to the derived class for executing
-   * the aggregation.
+   * @brief Callback to access the offsets separating groups.
    */
-  using input_data_t = std::variant<column_view, size_type, device_span<size_type const>>;
+  std::function<device_span<size_type const>(void)> callback_group_offsets;
 
   /**
-   * @brief Input to the aggregation, mapping from each data attribute to its actual data.
+   * @brief Callback to access the group labels (which is also the same as group indices).
    */
-  using input_map_t = std::
-    unordered_map<data_attribute, input_data_t, data_attribute::hash, data_attribute::equal_to>;
+  std::function<device_span<size_type const>(void)> callback_group_labels;
 
   /**
-   * @brief Output type of the aggregation.
+   * @brief Callback to access the result from other groupby aggregations.
+   */
+  std::function<column_view(std::unique_ptr<aggregation>)> callback_compute_aggregation;
+
+ protected:
+  /**
+   * @brief Access the input values column.
    *
-   * Currently only a single type is supported as the output of the aggregation, but it will hold
-   * more type in the future when reduction is supported.
+   * @return The input values column.
    */
-  using output_t = std::variant<std::unique_ptr<column>>;
+  [[nodiscard]] column_view get_input_values() const
+  {
+    CUDF_EXPECTS(callback_input_values, "Uninitialized callback_input_values.");
+    return callback_input_values();
+  }
 
   /**
-   * @brief Get the output when the input values column is empty.
+   * @brief Access the input values grouped according to the input keys for which the values
+   * within each group maintain their original order.
    *
-   * This is called in libcudf when the input values column is empty. In such situations libcudf
-   * tries to generate the output directly without unnecessarily evaluating the intermediate data.
+   * @return The grouped values column.
+   */
+  [[nodiscard]] column_view get_grouped_values() const
+  {
+    CUDF_EXPECTS(callback_grouped_values, "Uninitialized callback_grouped_values.");
+    return callback_grouped_values();
+  }
+
+  /**
+   * @brief Access the input values grouped according to the input keys and sorted within each
+   * group.
    *
-   * @param output_dtype The expected output data type
-   * @param stream The CUDA stream to use for any kernel launches
-   * @param mr Device memory resource to use for any allocations
-   * @return The output result of the aggregation when input values is empty
+   * @return The sorted grouped values column.
    */
-  [[nodiscard]] virtual output_t get_empty_output(std::optional<data_type> output_dtype,
-                                                  rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr) const = 0;
+  [[nodiscard]] column_view get_sorted_grouped_values() const
+  {
+    CUDF_EXPECTS(callback_sorted_grouped_values, "Uninitialized callback_sorted_grouped_values.");
+    return callback_sorted_grouped_values();
+  }
 
   /**
-   * @brief Perform the main computation for the host-based UDF.
+   * @brief Access the number of groups (i.e., number of distinct keys).
    *
-   * @param input The input data needed for performing all computation
-   * @param stream The CUDA stream to use for any kernel launches
-   * @param mr Device memory resource to use for any allocations
-   * @return The output result of the aggregation
+   * @return The number of groups.
    */
-  [[nodiscard]] virtual output_t operator()(input_map_t const& input,
-                                            rmm::cuda_stream_view stream,
-                                            rmm::device_async_resource_ref mr) const = 0;
+  [[nodiscard]] size_type get_num_groups() const
+  {
+    CUDF_EXPECTS(callback_num_groups, "Uninitialized callback_num_groups.");
+    return callback_num_groups();
+  }
 
   /**
-   * @brief Computes hash value of the class's instance.
-   * @return The hash value of the instance
+   * @brief Access the offsets separating groups.
+   *
+   * @return The array of group offsets.
    */
-  [[nodiscard]] virtual std::size_t do_hash() const
+  [[nodiscard]] device_span<size_type const> get_group_offsets() const
   {
-    return std::hash<int>{}(static_cast<int>(aggregation::Kind::HOST_UDF));
+    CUDF_EXPECTS(callback_group_offsets, "Uninitialized callback_group_offsets.");
+    return callback_group_offsets();
   }
 
   /**
-   * @brief Compares two instances of the derived class for equality.
-   * @param other The other derived class's instance to compare with
-   * @return True if the two instances are equal
+   * @brief Access the group labels (which is also the same as group indices).
+   *
+   * @return The array of group labels.
    */
-  [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0;
+  [[nodiscard]] device_span<size_type const> get_group_labels() const
+  {
+    CUDF_EXPECTS(callback_group_labels, "Uninitialized callback_group_labels.");
+    return callback_group_labels();
+  }
 
   /**
-   * @brief Clones the instance.
+   * @brief Compute a built-in groupby aggregation and access its result.
    *
-   * A class derived from `host_udf_base` should not store too much data such that its instances
-   * remain lightweight for efficient cloning.
+   * This allows the derived class to call any other built-in groupby aggregations on the same input
+   * values column and access the output for its operations.
    *
-   * @return A new instance cloned from this
+   * @param other_agg An arbitrary built-in groupby aggregation
+   * @return A `column_view` object corresponding to the output result of the given aggregation
    */
-  [[nodiscard]] virtual std::unique_ptr<host_udf_base> clone() const = 0;
+  [[nodiscard]] column_view compute_aggregation(std::unique_ptr<aggregation> other_agg) const
+  {
+    CUDF_EXPECTS(callback_compute_aggregation, "Uninitialized callback for computing aggregation.");
+    return callback_compute_aggregation(std::move(other_agg));
+  }
 };
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
index 9d8762555d7..001b604814c 100644
--- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh
+++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -452,7 +452,7 @@ struct expression_evaluator {
          ++operator_index) {
       // Execute operator
       auto const op    = plan.operators[operator_index];
-      auto const arity = ast_operator_arity(op);
+      auto const arity = plan.operator_arities[operator_index];
       if (arity == 1) {
         // Unary operator
         auto const& input =
diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index b5973d0ace9..d2e8c1cd41f 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 
 #include <cudf/ast/detail/operators.hpp>
 #include <cudf/ast/expressions.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -88,6 +89,7 @@ struct expression_device_view {
   device_span<detail::device_data_reference const> data_references;
   device_span<generic_scalar_device_view const> literals;
   device_span<ast_operator const> operators;
+  device_span<cudf::size_type const> operator_arities;
   device_span<cudf::size_type const> operator_source_indices;
   cudf::size_type num_intermediates;
 };
@@ -229,39 +231,55 @@ class expression_parser {
    * @param[in]  v  The `std::vector` containing components (operators, literals, etc).
    * @param[in,out]  sizes  The `std::vector` containing the size of each data buffer.
    * @param[in,out]  data_pointers  The `std::vector` containing pointers to each data buffer.
+   * @param[in,out]  alignment  The maximum alignment needed for all the extracted size and pointers
    */
   template <typename T>
   void extract_size_and_pointer(std::vector<T> const& v,
                                 std::vector<cudf::size_type>& sizes,
-                                std::vector<void const*>& data_pointers)
+                                std::vector<void const*>& data_pointers,
+                                cudf::size_type& alignment)
   {
+    // sub-type alignment will only work provided the alignment is lesser or equal to
+    // alignof(max_align_t) which is the maximum alignment provided by rmm's device buffers
+    static_assert(alignof(T) <= alignof(max_align_t));
     auto const data_size = sizeof(T) * v.size();
     sizes.push_back(data_size);
     data_pointers.push_back(v.data());
+    alignment = std::max(alignment, static_cast<cudf::size_type>(alignof(T)));
   }
 
   void move_to_device(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
   {
     std::vector<cudf::size_type> sizes;
     std::vector<void const*> data_pointers;
+    // use a minimum of 4-byte alignment
+    cudf::size_type buffer_alignment = 4;
 
-    extract_size_and_pointer(_data_references, sizes, data_pointers);
-    extract_size_and_pointer(_literals, sizes, data_pointers);
-    extract_size_and_pointer(_operators, sizes, data_pointers);
-    extract_size_and_pointer(_operator_source_indices, sizes, data_pointers);
+    extract_size_and_pointer(_data_references, sizes, data_pointers, buffer_alignment);
+    extract_size_and_pointer(_literals, sizes, data_pointers, buffer_alignment);
+    extract_size_and_pointer(_operators, sizes, data_pointers, buffer_alignment);
+    extract_size_and_pointer(_operator_arities, sizes, data_pointers, buffer_alignment);
+    extract_size_and_pointer(_operator_source_indices, sizes, data_pointers, buffer_alignment);
 
     // Create device buffer
-    auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0);
-    auto buffer_offsets    = std::vector<int>(sizes.size());
-    thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0);
+    auto buffer_offsets = std::vector<cudf::size_type>(sizes.size());
+    thrust::exclusive_scan(sizes.cbegin(),
+                           sizes.cend(),
+                           buffer_offsets.begin(),
+                           cudf::size_type{0},
+                           [buffer_alignment](auto a, auto b) {
+                             // align each component of the AST program
+                             return cudf::util::round_up_safe(a + b, buffer_alignment);
+                           });
+
+    auto const buffer_size = buffer_offsets.empty() ? 0 : (buffer_offsets.back() + sizes.back());
+    auto host_data_buffer  = std::vector<char>(buffer_size);
 
-    auto h_data_buffer = std::vector<char>(buffer_size);
     for (unsigned int i = 0; i < data_pointers.size(); ++i) {
-      std::memcpy(h_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]);
+      std::memcpy(host_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]);
     }
 
-    _device_data_buffer = rmm::device_buffer(h_data_buffer.data(), buffer_size, stream, mr);
-
+    _device_data_buffer = rmm::device_buffer(host_data_buffer.data(), buffer_size, stream, mr);
     stream.synchronize();
 
     // Create device pointers to components of plan
@@ -277,8 +295,11 @@ class expression_parser {
     device_expression_data.operators = device_span<ast_operator const>(
       reinterpret_cast<ast_operator const*>(device_data_buffer_ptr + buffer_offsets[2]),
       _operators.size());
-    device_expression_data.operator_source_indices = device_span<cudf::size_type const>(
+    device_expression_data.operator_arities = device_span<cudf::size_type const>(
       reinterpret_cast<cudf::size_type const*>(device_data_buffer_ptr + buffer_offsets[3]),
+      _operators.size());
+    device_expression_data.operator_source_indices = device_span<cudf::size_type const>(
+      reinterpret_cast<cudf::size_type const*>(device_data_buffer_ptr + buffer_offsets[4]),
       _operator_source_indices.size());
     device_expression_data.num_intermediates = _intermediate_counter.get_max_used();
     shmem_per_thread                         = static_cast<int>(
@@ -322,6 +343,7 @@ class expression_parser {
   bool _has_nulls;
   std::vector<detail::device_data_reference> _data_references;
   std::vector<ast_operator> _operators;
+  std::vector<cudf::size_type> _operator_arities;
   std::vector<cudf::size_type> _operator_source_indices;
   std::vector<generic_scalar_device_view> _literals;
 };
diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index 46507700e21..db04e1fe989 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -69,159 +69,111 @@ constexpr bool is_valid_unary_op = cuda::std::is_invocable_v<Op, T>;
  * @param args Forwarded arguments to `operator()` of `f`.
  */
 template <typename F, typename... Ts>
-CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, F&& f, Ts&&... args)
+CUDF_HOST_DEVICE inline constexpr decltype(auto) ast_operator_dispatcher(ast_operator op,
+                                                                         F&& f,
+                                                                         Ts&&... args)
 {
   switch (op) {
     case ast_operator::ADD:
-      f.template operator()<ast_operator::ADD>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::ADD>(std::forward<Ts>(args)...);
     case ast_operator::SUB:
-      f.template operator()<ast_operator::SUB>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::SUB>(std::forward<Ts>(args)...);
     case ast_operator::MUL:
-      f.template operator()<ast_operator::MUL>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::MUL>(std::forward<Ts>(args)...);
     case ast_operator::DIV:
-      f.template operator()<ast_operator::DIV>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::DIV>(std::forward<Ts>(args)...);
     case ast_operator::TRUE_DIV:
-      f.template operator()<ast_operator::TRUE_DIV>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::TRUE_DIV>(std::forward<Ts>(args)...);
     case ast_operator::FLOOR_DIV:
-      f.template operator()<ast_operator::FLOOR_DIV>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::FLOOR_DIV>(std::forward<Ts>(args)...);
     case ast_operator::MOD:
-      f.template operator()<ast_operator::MOD>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::MOD>(std::forward<Ts>(args)...);
     case ast_operator::PYMOD:
-      f.template operator()<ast_operator::PYMOD>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::PYMOD>(std::forward<Ts>(args)...);
     case ast_operator::POW:
-      f.template operator()<ast_operator::POW>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::POW>(std::forward<Ts>(args)...);
     case ast_operator::EQUAL:
-      f.template operator()<ast_operator::EQUAL>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::EQUAL>(std::forward<Ts>(args)...);
     case ast_operator::NULL_EQUAL:
-      f.template operator()<ast_operator::NULL_EQUAL>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::NULL_EQUAL>(std::forward<Ts>(args)...);
     case ast_operator::NOT_EQUAL:
-      f.template operator()<ast_operator::NOT_EQUAL>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::NOT_EQUAL>(std::forward<Ts>(args)...);
     case ast_operator::LESS:
-      f.template operator()<ast_operator::LESS>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::LESS>(std::forward<Ts>(args)...);
     case ast_operator::GREATER:
-      f.template operator()<ast_operator::GREATER>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::GREATER>(std::forward<Ts>(args)...);
     case ast_operator::LESS_EQUAL:
-      f.template operator()<ast_operator::LESS_EQUAL>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::LESS_EQUAL>(std::forward<Ts>(args)...);
     case ast_operator::GREATER_EQUAL:
-      f.template operator()<ast_operator::GREATER_EQUAL>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::GREATER_EQUAL>(std::forward<Ts>(args)...);
     case ast_operator::BITWISE_AND:
-      f.template operator()<ast_operator::BITWISE_AND>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::BITWISE_AND>(std::forward<Ts>(args)...);
     case ast_operator::BITWISE_OR:
-      f.template operator()<ast_operator::BITWISE_OR>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::BITWISE_OR>(std::forward<Ts>(args)...);
     case ast_operator::BITWISE_XOR:
-      f.template operator()<ast_operator::BITWISE_XOR>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::BITWISE_XOR>(std::forward<Ts>(args)...);
     case ast_operator::LOGICAL_AND:
-      f.template operator()<ast_operator::LOGICAL_AND>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::LOGICAL_AND>(std::forward<Ts>(args)...);
     case ast_operator::NULL_LOGICAL_AND:
-      f.template operator()<ast_operator::NULL_LOGICAL_AND>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::NULL_LOGICAL_AND>(std::forward<Ts>(args)...);
     case ast_operator::LOGICAL_OR:
-      f.template operator()<ast_operator::LOGICAL_OR>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::LOGICAL_OR>(std::forward<Ts>(args)...);
     case ast_operator::NULL_LOGICAL_OR:
-      f.template operator()<ast_operator::NULL_LOGICAL_OR>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::NULL_LOGICAL_OR>(std::forward<Ts>(args)...);
     case ast_operator::IDENTITY:
-      f.template operator()<ast_operator::IDENTITY>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::IDENTITY>(std::forward<Ts>(args)...);
     case ast_operator::IS_NULL:
-      f.template operator()<ast_operator::IS_NULL>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::IS_NULL>(std::forward<Ts>(args)...);
     case ast_operator::SIN:
-      f.template operator()<ast_operator::SIN>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::SIN>(std::forward<Ts>(args)...);
     case ast_operator::COS:
-      f.template operator()<ast_operator::COS>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::COS>(std::forward<Ts>(args)...);
     case ast_operator::TAN:
-      f.template operator()<ast_operator::TAN>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::TAN>(std::forward<Ts>(args)...);
     case ast_operator::ARCSIN:
-      f.template operator()<ast_operator::ARCSIN>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::ARCSIN>(std::forward<Ts>(args)...);
     case ast_operator::ARCCOS:
-      f.template operator()<ast_operator::ARCCOS>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::ARCCOS>(std::forward<Ts>(args)...);
     case ast_operator::ARCTAN:
-      f.template operator()<ast_operator::ARCTAN>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::ARCTAN>(std::forward<Ts>(args)...);
     case ast_operator::SINH:
-      f.template operator()<ast_operator::SINH>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::SINH>(std::forward<Ts>(args)...);
     case ast_operator::COSH:
-      f.template operator()<ast_operator::COSH>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::COSH>(std::forward<Ts>(args)...);
     case ast_operator::TANH:
-      f.template operator()<ast_operator::TANH>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::TANH>(std::forward<Ts>(args)...);
     case ast_operator::ARCSINH:
-      f.template operator()<ast_operator::ARCSINH>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::ARCSINH>(std::forward<Ts>(args)...);
     case ast_operator::ARCCOSH:
-      f.template operator()<ast_operator::ARCCOSH>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::ARCCOSH>(std::forward<Ts>(args)...);
     case ast_operator::ARCTANH:
-      f.template operator()<ast_operator::ARCTANH>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::ARCTANH>(std::forward<Ts>(args)...);
     case ast_operator::EXP:
-      f.template operator()<ast_operator::EXP>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::EXP>(std::forward<Ts>(args)...);
     case ast_operator::LOG:
-      f.template operator()<ast_operator::LOG>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::LOG>(std::forward<Ts>(args)...);
     case ast_operator::SQRT:
-      f.template operator()<ast_operator::SQRT>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::SQRT>(std::forward<Ts>(args)...);
     case ast_operator::CBRT:
-      f.template operator()<ast_operator::CBRT>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::CBRT>(std::forward<Ts>(args)...);
     case ast_operator::CEIL:
-      f.template operator()<ast_operator::CEIL>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::CEIL>(std::forward<Ts>(args)...);
     case ast_operator::FLOOR:
-      f.template operator()<ast_operator::FLOOR>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::FLOOR>(std::forward<Ts>(args)...);
     case ast_operator::ABS:
-      f.template operator()<ast_operator::ABS>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::ABS>(std::forward<Ts>(args)...);
     case ast_operator::RINT:
-      f.template operator()<ast_operator::RINT>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::RINT>(std::forward<Ts>(args)...);
     case ast_operator::BIT_INVERT:
-      f.template operator()<ast_operator::BIT_INVERT>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::BIT_INVERT>(std::forward<Ts>(args)...);
     case ast_operator::NOT:
-      f.template operator()<ast_operator::NOT>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::NOT>(std::forward<Ts>(args)...);
     case ast_operator::CAST_TO_INT64:
-      f.template operator()<ast_operator::CAST_TO_INT64>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::CAST_TO_INT64>(std::forward<Ts>(args)...);
     case ast_operator::CAST_TO_UINT64:
-      f.template operator()<ast_operator::CAST_TO_UINT64>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::CAST_TO_UINT64>(std::forward<Ts>(args)...);
     case ast_operator::CAST_TO_FLOAT64:
-      f.template operator()<ast_operator::CAST_TO_FLOAT64>(std::forward<Ts>(args)...);
-      break;
+      return f.template operator()<ast_operator::CAST_TO_FLOAT64>(std::forward<Ts>(args)...);
     default: {
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("Invalid operator.");
@@ -955,231 +907,6 @@ struct operator_functor<ast_operator::NULL_LOGICAL_OR, true> {
   }
 };
 
-/**
- * @brief Functor used to single-type-dispatch binary operators.
- *
- * This functor's `operator()` is templated to validate calls to its operators based on the input
- * type, as determined by the `is_valid_binary_op` trait. This function assumes that both inputs are
- * the same type, and dispatches based on the type of the left input.
- *
- * @tparam OperatorFunctor Binary operator functor.
- */
-template <typename OperatorFunctor>
-struct single_dispatch_binary_operator_types {
-  template <typename LHS,
-            typename F,
-            typename... Ts,
-            std::enable_if_t<is_valid_binary_op<OperatorFunctor, LHS, LHS>>* = nullptr>
-  CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args)
-  {
-    f.template operator()<OperatorFunctor, LHS, LHS>(std::forward<Ts>(args)...);
-  }
-
-  template <typename LHS,
-            typename F,
-            typename... Ts,
-            std::enable_if_t<!is_valid_binary_op<OperatorFunctor, LHS, LHS>>* = nullptr>
-  CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args)
-  {
-#ifndef __CUDA_ARCH__
-    CUDF_FAIL("Invalid binary operation.");
-#else
-    CUDF_UNREACHABLE("Invalid binary operation.");
-#endif
-  }
-};
-
-/**
- * @brief Functor performing a type dispatch for a binary operator.
- *
- * This functor performs single dispatch, which assumes lhs_type == rhs_type. This may not be true
- * for all binary operators but holds for all currently implemented operators.
- */
-struct type_dispatch_binary_op {
-  /**
-   * @brief Performs type dispatch for a binary operator.
-   *
-   * @tparam op AST operator.
-   * @tparam F Type of forwarded functor.
-   * @tparam Ts Parameter pack of forwarded arguments.
-   * @param lhs_type Type of left input data.
-   * @param rhs_type Type of right input data.
-   * @param f Forwarded functor to be called.
-   * @param args Forwarded arguments to `operator()` of `f`.
-   */
-  template <ast_operator op, typename F, typename... Ts>
-  CUDF_HOST_DEVICE inline void operator()(cudf::data_type lhs_type,
-                                          cudf::data_type rhs_type,
-                                          F&& f,
-                                          Ts&&... args)
-  {
-    // Single dispatch (assume lhs_type == rhs_type)
-    type_dispatcher(
-      lhs_type,
-      // Always dispatch to the non-null operator for the purpose of type determination.
-      detail::single_dispatch_binary_operator_types<operator_functor<op, false>>{},
-      std::forward<F>(f),
-      std::forward<Ts>(args)...);
-  }
-};
-
-/**
- * @brief Dispatches a runtime binary operator to a templated type dispatcher.
- *
- * @tparam F Type of forwarded functor.
- * @tparam Ts Parameter pack of forwarded arguments.
- * @param lhs_type Type of left input data.
- * @param rhs_type Type of right input data.
- * @param f Forwarded functor to be called.
- * @param args Forwarded arguments to `operator()` of `f`.
- */
-template <typename F, typename... Ts>
-CUDF_HOST_DEVICE inline constexpr void binary_operator_dispatcher(
-  ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args)
-{
-  ast_operator_dispatcher(op,
-                          detail::type_dispatch_binary_op{},
-                          lhs_type,
-                          rhs_type,
-                          std::forward<F>(f),
-                          std::forward<Ts>(args)...);
-}
-
-/**
- * @brief Functor used to type-dispatch unary operators.
- *
- * This functor's `operator()` is templated to validate calls to its operators based on the input
- * type, as determined by the `is_valid_unary_op` trait.
- *
- * @tparam OperatorFunctor Unary operator functor.
- */
-template <typename OperatorFunctor>
-struct dispatch_unary_operator_types {
-  template <typename InputT,
-            typename F,
-            typename... Ts,
-            std::enable_if_t<is_valid_unary_op<OperatorFunctor, InputT>>* = nullptr>
-  CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args)
-  {
-    f.template operator()<OperatorFunctor, InputT>(std::forward<Ts>(args)...);
-  }
-
-  template <typename InputT,
-            typename F,
-            typename... Ts,
-            std::enable_if_t<!is_valid_unary_op<OperatorFunctor, InputT>>* = nullptr>
-  CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args)
-  {
-#ifndef __CUDA_ARCH__
-    CUDF_FAIL("Invalid unary operation.");
-#else
-    CUDF_UNREACHABLE("Invalid unary operation.");
-#endif
-  }
-};
-
-/**
- * @brief Functor performing a type dispatch for a unary operator.
- */
-struct type_dispatch_unary_op {
-  template <ast_operator op, typename F, typename... Ts>
-  CUDF_HOST_DEVICE inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args)
-  {
-    type_dispatcher(
-      input_type,
-      // Always dispatch to the non-null operator for the purpose of type determination.
-      detail::dispatch_unary_operator_types<operator_functor<op, false>>{},
-      std::forward<F>(f),
-      std::forward<Ts>(args)...);
-  }
-};
-
-/**
- * @brief Dispatches a runtime unary operator to a templated type dispatcher.
- *
- * @tparam F Type of forwarded functor.
- * @tparam Ts Parameter pack of forwarded arguments.
- * @param input_type Type of input data.
- * @param f Forwarded functor to be called.
- * @param args Forwarded arguments to `operator()` of `f`.
- */
-template <typename F, typename... Ts>
-CUDF_HOST_DEVICE inline constexpr void unary_operator_dispatcher(ast_operator op,
-                                                                 cudf::data_type input_type,
-                                                                 F&& f,
-                                                                 Ts&&... args)
-{
-  ast_operator_dispatcher(op,
-                          detail::type_dispatch_unary_op{},
-                          input_type,
-                          std::forward<F>(f),
-                          std::forward<Ts>(args)...);
-}
-
-/**
- * @brief Functor to determine the return type of an operator from its input types.
- */
-struct return_type_functor {
-  /**
-   * @brief Callable for binary operators to determine return type.
-   *
-   * @tparam OperatorFunctor Operator functor to perform.
-   * @tparam LHS Left input type.
-   * @tparam RHS Right input type.
-   * @param result Reference whose value is assigned to the result data type.
-   */
-  template <typename OperatorFunctor,
-            typename LHS,
-            typename RHS,
-            std::enable_if_t<is_valid_binary_op<OperatorFunctor, LHS, RHS>>* = nullptr>
-  CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result)
-  {
-    using Out = cuda::std::invoke_result_t<OperatorFunctor, LHS, RHS>;
-    result    = cudf::data_type(cudf::type_to_id<Out>());
-  }
-
-  template <typename OperatorFunctor,
-            typename LHS,
-            typename RHS,
-            std::enable_if_t<!is_valid_binary_op<OperatorFunctor, LHS, RHS>>* = nullptr>
-  CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result)
-  {
-#ifndef __CUDA_ARCH__
-    CUDF_FAIL("Invalid binary operation. Return type cannot be determined.");
-#else
-    CUDF_UNREACHABLE("Invalid binary operation. Return type cannot be determined.");
-#endif
-  }
-
-  /**
-   * @brief Callable for unary operators to determine return type.
-   *
-   * @tparam OperatorFunctor Operator functor to perform.
-   * @tparam T Input type.
-   * @param result Pointer whose value is assigned to the result data type.
-   */
-  template <typename OperatorFunctor,
-            typename T,
-            std::enable_if_t<is_valid_unary_op<OperatorFunctor, T>>* = nullptr>
-  CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result)
-  {
-    using Out = cuda::std::invoke_result_t<OperatorFunctor, T>;
-    result    = cudf::data_type(cudf::type_to_id<Out>());
-  }
-
-  template <typename OperatorFunctor,
-            typename T,
-            std::enable_if_t<!is_valid_unary_op<OperatorFunctor, T>>* = nullptr>
-  CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result)
-  {
-#ifndef __CUDA_ARCH__
-    CUDF_FAIL("Invalid unary operation. Return type cannot be determined.");
-#else
-    CUDF_UNREACHABLE("Invalid unary operation. Return type cannot be determined.");
-#endif
-  }
-};
-
 /**
  * @brief Gets the return type of an AST operator.
  *
@@ -1187,34 +914,8 @@ struct return_type_functor {
  * @param operand_types Vector of input types to the operator.
  * @return cudf::data_type Return type of the operator.
  */
-inline cudf::data_type ast_operator_return_type(ast_operator op,
-                                                std::vector<cudf::data_type> const& operand_types)
-{
-  auto result = cudf::data_type(cudf::type_id::EMPTY);
-  switch (operand_types.size()) {
-    case 1:
-      unary_operator_dispatcher(op, operand_types[0], detail::return_type_functor{}, result);
-      break;
-    case 2:
-      binary_operator_dispatcher(
-        op, operand_types[0], operand_types[1], detail::return_type_functor{}, result);
-      break;
-    default: CUDF_FAIL("Unsupported operator return type."); break;
-  }
-  return result;
-}
-
-/**
- * @brief Functor to determine the arity (number of operands) of an operator.
- */
-struct arity_functor {
-  template <ast_operator op>
-  CUDF_HOST_DEVICE inline void operator()(cudf::size_type& result)
-  {
-    // Arity is not dependent on null handling, so just use the false implementation here.
-    result = operator_functor<op, false>::arity;
-  }
-};
+cudf::data_type ast_operator_return_type(ast_operator op,
+                                         std::vector<cudf::data_type> const& operand_types);
 
 /**
  * @brief Gets the arity (number of operands) of an AST operator.
@@ -1222,12 +923,7 @@ struct arity_functor {
  * @param op Operator used to determine arity.
  * @return Arity of the operator.
  */
-CUDF_HOST_DEVICE inline cudf::size_type ast_operator_arity(ast_operator op)
-{
-  auto result = cudf::size_type(0);
-  ast_operator_dispatcher(op, detail::arity_functor{}, result);
-  return result;
-}
+cudf::size_type ast_operator_arity(ast_operator op);
 
 }  // namespace detail
 
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index d873e93bd20..5574ed6ea6e 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -967,7 +967,9 @@ class udf_aggregation final : public rolling_aggregation {
 /**
  * @brief Derived class for specifying host-based UDF aggregation.
  */
-class host_udf_aggregation final : public groupby_aggregation {
+class host_udf_aggregation final : public groupby_aggregation,
+                                   public reduce_aggregation,
+                                   public segmented_reduce_aggregation {
  public:
   std::unique_ptr<host_udf_base> udf_ptr;
 
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 163fa20806d..82f7761da2e 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -578,7 +578,7 @@ class orc_writer_options {
   // Specify the sink to use for writer output
   sink_info _sink;
   // Specify the compression format to use
-  compression_type _compression = compression_type::AUTO;
+  compression_type _compression = compression_type::SNAPPY;
   // Specify frequency of statistics collection
   statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
   // Maximum size of each stripe (unless smaller than a single row group)
@@ -733,7 +733,11 @@ class orc_writer_options {
    *
    * @param comp Compression type
    */
-  void set_compression(compression_type comp) { _compression = comp; }
+  void set_compression(compression_type comp)
+  {
+    _compression = comp;
+    if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; }
+  }
 
   /**
    * @brief Choose granularity of statistics collection.
@@ -865,7 +869,7 @@ class orc_writer_options_builder {
    */
   orc_writer_options_builder& compression(compression_type comp)
   {
-    options._compression = comp;
+    options.set_compression(comp);
     return *this;
   }
 
@@ -1026,7 +1030,7 @@ class chunked_orc_writer_options {
   // Specify the sink to use for writer output
   sink_info _sink;
   // Specify the compression format to use
-  compression_type _compression = compression_type::AUTO;
+  compression_type _compression = compression_type::SNAPPY;
   // Specify granularity of statistics collection
   statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
   // Maximum size of each stripe (unless smaller than a single row group)
@@ -1157,7 +1161,11 @@ class chunked_orc_writer_options {
    *
    * @param comp The compression type to use
    */
-  void set_compression(compression_type comp) { _compression = comp; }
+  void set_compression(compression_type comp)
+  {
+    _compression = comp;
+    if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; }
+  }
 
   /**
    * @brief Choose granularity of statistics collection
@@ -1279,7 +1287,7 @@ class chunked_orc_writer_options_builder {
    */
   chunked_orc_writer_options_builder& compression(compression_type comp)
   {
-    options._compression = comp;
+    options.set_compression(comp);
     return *this;
   }
 
diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp
deleted file mode 100644
index ae48fed4e79..00000000000
--- a/cpp/include/nvtext/detail/generate_ngrams.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cudf/utilities/memory_resource.hpp>
-
-#include <nvtext/generate_ngrams.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-
-namespace CUDF_EXPORT nvtext {
-namespace detail {
-
-/**
- * @copydoc hash_character_ngrams(cudf::strings_column_view const&,
- * cudf::size_type, rmm::device_async_resource_ref)
- *
- * @param stream CUDA stream used for allocating/copying device memory and launching kernels
- */
-std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
-                                                    cudf::size_type ngrams,
-                                                    rmm::cuda_stream_view stream,
-                                                    rmm::device_async_resource_ref mr);
-
-}  // namespace detail
-}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp
index 54282b8ef3c..b2ba1798a8f 100644
--- a/cpp/include/nvtext/generate_ngrams.hpp
+++ b/cpp/include/nvtext/generate_ngrams.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -117,6 +117,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
  *
  * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate. Default is 5.
+ * @param seed The seed value to use with the hash algorithm. Default is 0.
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return A lists column of hash values
@@ -124,6 +125,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
 std::unique_ptr<cudf::column> hash_character_ngrams(
   cudf::strings_column_view const& input,
   cudf::size_type ngrams            = 5,
+  uint32_t seed                     = 0,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp
index d0e4c59ca54..b2cc134d9fa 100644
--- a/cpp/src/ast/expression_parser.cpp
+++ b/cpp/src/ast/expression_parser.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -161,6 +161,7 @@ cudf::size_type expression_parser::visit(operation const& expr)
   auto const op        = expr.get_operator();
   auto const data_type = cudf::ast::detail::ast_operator_return_type(op, operand_types);
   _operators.push_back(op);
+  _operator_arities.push_back(cudf::ast::detail::ast_operator_arity(op));
   // Push data reference
   auto const output = [&]() {
     if (expression_index == 0) {
diff --git a/cpp/src/ast/operators.cpp b/cpp/src/ast/operators.cpp
new file mode 100644
index 00000000000..b60a69a42d9
--- /dev/null
+++ b/cpp/src/ast/operators.cpp
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cudf/ast/detail/operators.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <cuda/std/type_traits>
+#include <thrust/optional.h>
+
+#include <vector>
+
+namespace cudf {
+namespace ast {
+namespace detail {
+namespace {
+
+struct arity_functor {
+  template <ast_operator op>
+  void operator()(cudf::size_type& result)
+  {
+    // Arity is not dependent on null handling, so just use the false implementation here.
+    result = operator_functor<op, false>::arity;
+  }
+};
+
+/**
+ * @brief Functor to determine the return type of an operator from its input types.
+ */
+struct return_type_functor {
+  /**
+   * @brief Callable for binary operators to determine return type.
+   *
+   * @tparam OperatorFunctor Operator functor to perform.
+   * @tparam LHS Left input type.
+   * @tparam RHS Right input type.
+   * @param result Pointer whose value is assigned to the result data type.
+   */
+  template <typename OperatorFunctor,
+            typename LHS,
+            typename RHS,
+            std::enable_if_t<is_valid_binary_op<OperatorFunctor, LHS, RHS>>* = nullptr>
+  void operator()(cudf::data_type& result)
+  {
+    using Out = cuda::std::invoke_result_t<OperatorFunctor, LHS, RHS>;
+    result    = cudf::data_type{cudf::type_to_id<Out>()};
+  }
+
+  template <typename OperatorFunctor,
+            typename LHS,
+            typename RHS,
+            std::enable_if_t<!is_valid_binary_op<OperatorFunctor, LHS, RHS>>* = nullptr>
+  void operator()(cudf::data_type& result)
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_FAIL("Invalid binary operation. Return type cannot be determined.");
+#else
+    CUDF_UNREACHABLE("Invalid binary operation. Return type cannot be determined.");
+#endif
+    result = cudf::data_type{cudf::type_id::EMPTY};
+  }
+
+  /**
+   * @brief Callable for unary operators to determine return type.
+   *
+   * @tparam OperatorFunctor Operator functor to perform.
+   * @tparam T Input type.
+   * @param result Pointer whose value is assigned to the result data type.
+   */
+  template <typename OperatorFunctor,
+            typename T,
+            std::enable_if_t<is_valid_unary_op<OperatorFunctor, T>>* = nullptr>
+  void operator()(cudf::data_type& result)
+  {
+    using Out = cuda::std::invoke_result_t<OperatorFunctor, T>;
+    result    = cudf::data_type{cudf::type_to_id<Out>()};
+  }
+
+  template <typename OperatorFunctor,
+            typename T,
+            std::enable_if_t<!is_valid_unary_op<OperatorFunctor, T>>* = nullptr>
+  void operator()(cudf::data_type& result)
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_FAIL("Invalid unary operation. Return type cannot be determined.");
+#else
+    CUDF_UNREACHABLE("Invalid unary operation. Return type cannot be determined.");
+#endif
+    result = cudf::data_type{cudf::type_id::EMPTY};
+  }
+};
+
+/**
+ * @brief Functor used to single-type-dispatch binary operators.
+ *
+ * This functor's `operator()` is templated to validate calls to its operators based on the input
+ * type, as determined by the `is_valid_binary_op` trait. This function assumes that both inputs are
+ * the same type, and dispatches based on the type of the left input.
+ *
+ * @tparam OperatorFunctor Binary operator functor.
+ */
+template <typename OperatorFunctor>
+struct single_dispatch_binary_operator_types {
+  template <typename LHS,
+            typename F,
+            typename... Ts,
+            std::enable_if_t<is_valid_binary_op<OperatorFunctor, LHS, LHS>>* = nullptr>
+  inline void operator()(F&& f, Ts&&... args)
+  {
+    f.template operator()<OperatorFunctor, LHS, LHS>(std::forward<Ts>(args)...);
+  }
+
+  template <typename LHS,
+            typename F,
+            typename... Ts,
+            std::enable_if_t<!is_valid_binary_op<OperatorFunctor, LHS, LHS>>* = nullptr>
+  inline void operator()(F&& f, Ts&&... args)
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_FAIL("Invalid binary operation.");
+#else
+    CUDF_UNREACHABLE("Invalid binary operation.");
+#endif
+  }
+};
+
+/**
+ * @brief Functor performing a type dispatch for a binary operator.
+ *
+ * This functor performs single dispatch, which assumes lhs_type == rhs_type. This may not be true
+ * for all binary operators but holds for all currently implemented operators.
+ */
+struct type_dispatch_binary_op {
+  /**
+   * @brief Performs type dispatch for a binary operator.
+   *
+   * @tparam op AST operator.
+   * @tparam F Type of forwarded functor.
+   * @tparam Ts Parameter pack of forwarded arguments.
+   * @param lhs_type Type of left input data.
+   * @param rhs_type Type of right input data.
+   * @param f Forwarded functor to be called.
+   * @param args Forwarded arguments to `operator()` of `f`.
+   */
+  template <ast_operator op, typename F, typename... Ts>
+  inline void operator()(cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args)
+  {
+    // Single dispatch (assume lhs_type == rhs_type)
+    type_dispatcher(
+      lhs_type,
+      // Always dispatch to the non-null operator for the purpose of type determination.
+      detail::single_dispatch_binary_operator_types<operator_functor<op, false>>{},
+      std::forward<F>(f),
+      std::forward<Ts>(args)...);
+  }
+};
+
+/**
+ * @brief Dispatches a runtime binary operator to a templated type dispatcher.
+ *
+ * @tparam F Type of forwarded functor.
+ * @tparam Ts Parameter pack of forwarded arguments.
+ * @param lhs_type Type of left input data.
+ * @param rhs_type Type of right input data.
+ * @param f Forwarded functor to be called.
+ * @param args Forwarded arguments to `operator()` of `f`.
+ */
+template <typename F, typename... Ts>
+inline constexpr void binary_operator_dispatcher(
+  ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args)
+{
+  ast_operator_dispatcher(op,
+                          detail::type_dispatch_binary_op{},
+                          lhs_type,
+                          rhs_type,
+                          std::forward<F>(f),
+                          std::forward<Ts>(args)...);
+}
+
+/**
+ * @brief Functor used to type-dispatch unary operators.
+ *
+ * This functor's `operator()` is templated to validate calls to its operators based on the input
+ * type, as determined by the `is_valid_unary_op` trait.
+ *
+ * @tparam OperatorFunctor Unary operator functor.
+ */
+template <typename OperatorFunctor>
+struct dispatch_unary_operator_types {
+  template <typename InputT,
+            typename F,
+            typename... Ts,
+            std::enable_if_t<is_valid_unary_op<OperatorFunctor, InputT>>* = nullptr>
+  inline void operator()(F&& f, Ts&&... args)
+  {
+    f.template operator()<OperatorFunctor, InputT>(std::forward<Ts>(args)...);
+  }
+
+  template <typename InputT,
+            typename F,
+            typename... Ts,
+            std::enable_if_t<!is_valid_unary_op<OperatorFunctor, InputT>>* = nullptr>
+  inline void operator()(F&& f, Ts&&... args)
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_FAIL("Invalid unary operation.");
+#else
+    CUDF_UNREACHABLE("Invalid unary operation.");
+#endif
+  }
+};
+
+/**
+ * @brief Functor performing a type dispatch for a unary operator.
+ */
+struct type_dispatch_unary_op {
+  template <ast_operator op, typename F, typename... Ts>
+  inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args)
+  {
+    type_dispatcher(
+      input_type,
+      // Always dispatch to the non-null operator for the purpose of type determination.
+      detail::dispatch_unary_operator_types<operator_functor<op, false>>{},
+      std::forward<F>(f),
+      std::forward<Ts>(args)...);
+  }
+};
+
+/**
+ * @brief Dispatches a runtime unary operator to a templated type dispatcher.
+ *
+ * @tparam F Type of forwarded functor.
+ * @tparam Ts Parameter pack of forwarded arguments.
+ * @param input_type Type of input data.
+ * @param f Forwarded functor to be called.
+ * @param args Forwarded arguments to `operator()` of `f`.
+ */
+template <typename F, typename... Ts>
+inline constexpr void unary_operator_dispatcher(ast_operator op,
+                                                cudf::data_type input_type,
+                                                F&& f,
+                                                Ts&&... args)
+{
+  ast_operator_dispatcher(op,
+                          detail::type_dispatch_unary_op{},
+                          input_type,
+                          std::forward<F>(f),
+                          std::forward<Ts>(args)...);
+}
+
+}  // namespace
+
+cudf::data_type ast_operator_return_type(ast_operator op,
+                                         std::vector<cudf::data_type> const& operand_types)
+{
+  cudf::data_type result{cudf::type_id::EMPTY};
+  switch (operand_types.size()) {
+    case 1:
+      unary_operator_dispatcher(op, operand_types[0], detail::return_type_functor{}, result);
+      break;
+    case 2:
+      binary_operator_dispatcher(
+        op, operand_types[0], operand_types[1], detail::return_type_functor{}, result);
+      break;
+    default: CUDF_FAIL("Unsupported operator return type."); break;
+  }
+  return result;
+}
+
+cudf::size_type ast_operator_arity(ast_operator op)
+{
+  cudf::size_type result{};
+  ast_operator_dispatcher(op, arity_functor{}, result);
+  return result;
+}
+
+}  // namespace detail
+
+}  // namespace ast
+
+}  // namespace cudf
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 4c90cd0eef5..6234148e9fa 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -145,8 +145,11 @@ struct empty_column_constructor {
     }
 
     if constexpr (k == aggregation::Kind::HOST_UDF) {
-      auto const& udf_ptr = dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
-      return std::get<std::unique_ptr<column>>(udf_ptr->get_empty_output(std::nullopt, stream, mr));
+      auto const& udf_base_ptr =
+        dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
+      auto const udf_ptr = dynamic_cast<groupby_host_udf const*>(udf_base_ptr.get());
+      CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation.");
+      return udf_ptr->get_empty_output(stream, mr);
     }
 
     return make_empty_column(target_type(values.type(), k));
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 6480070e85a..fb3f7559d64 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -795,58 +795,41 @@ void aggregate_result_functor::operator()<aggregation::HOST_UDF>(aggregation con
 {
   if (cache.has_result(values, agg)) { return; }
 
-  auto const& udf_ptr   = dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
-  auto const data_attrs = [&]() -> host_udf_base::data_attribute_set_t {
-    if (auto tmp = udf_ptr->get_required_data(); !tmp.empty()) { return tmp; }
-    // Empty attribute set means everything.
-    return {host_udf_base::groupby_data_attribute::INPUT_VALUES,
-            host_udf_base::groupby_data_attribute::GROUPED_VALUES,
-            host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES,
-            host_udf_base::groupby_data_attribute::NUM_GROUPS,
-            host_udf_base::groupby_data_attribute::GROUP_OFFSETS,
-            host_udf_base::groupby_data_attribute::GROUP_LABELS};
-  }();
+  auto const& udf_base_ptr = dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
+  auto const udf_ptr       = dynamic_cast<groupby_host_udf*>(udf_base_ptr.get());
+  CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation.");
 
-  // Do not cache udf_input, as the actual input data may change from run to run.
-  host_udf_base::input_map_t udf_input;
-  for (auto const& attr : data_attrs) {
-    CUDF_EXPECTS(std::holds_alternative<host_udf_base::groupby_data_attribute>(attr.value) ||
-                   std::holds_alternative<std::unique_ptr<aggregation>>(attr.value),
-                 "Invalid input data attribute for HOST_UDF groupby aggregation.");
-    if (std::holds_alternative<host_udf_base::groupby_data_attribute>(attr.value)) {
-      switch (std::get<host_udf_base::groupby_data_attribute>(attr.value)) {
-        case host_udf_base::groupby_data_attribute::INPUT_VALUES:
-          udf_input.emplace(attr, values);
-          break;
-        case host_udf_base::groupby_data_attribute::GROUPED_VALUES:
-          udf_input.emplace(attr, get_grouped_values());
-          break;
-        case host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES:
-          udf_input.emplace(attr, get_sorted_values());
-          break;
-        case host_udf_base::groupby_data_attribute::NUM_GROUPS:
-          udf_input.emplace(attr, helper.num_groups(stream));
-          break;
-        case host_udf_base::groupby_data_attribute::GROUP_OFFSETS:
-          udf_input.emplace(attr, helper.group_offsets(stream));
-          break;
-        case host_udf_base::groupby_data_attribute::GROUP_LABELS:
-          udf_input.emplace(attr, helper.group_labels(stream));
-          break;
-        default: CUDF_UNREACHABLE("Invalid input data attribute for HOST_UDF groupby aggregation.");
-      }
-    } else {  // data is result from another aggregation
-      auto other_agg = std::get<std::unique_ptr<aggregation>>(attr.value)->clone();
+  if (!udf_ptr->callback_input_values) {
+    udf_ptr->callback_input_values = [&]() -> column_view { return values; };
+  }
+  if (!udf_ptr->callback_grouped_values) {
+    udf_ptr->callback_grouped_values = [&]() -> column_view { return get_grouped_values(); };
+  }
+  if (!udf_ptr->callback_sorted_grouped_values) {
+    udf_ptr->callback_sorted_grouped_values = [&]() -> column_view { return get_sorted_values(); };
+  }
+  if (!udf_ptr->callback_num_groups) {
+    udf_ptr->callback_num_groups = [&]() -> size_type { return helper.num_groups(stream); };
+  }
+  if (!udf_ptr->callback_group_offsets) {
+    udf_ptr->callback_group_offsets = [&]() -> device_span<size_type const> {
+      return helper.group_offsets(stream);
+    };
+  }
+  if (!udf_ptr->callback_group_labels) {
+    udf_ptr->callback_group_labels = [&]() -> device_span<size_type const> {
+      return helper.group_labels(stream);
+    };
+  }
+  if (!udf_ptr->callback_compute_aggregation) {
+    udf_ptr->callback_compute_aggregation =
+      [&](std::unique_ptr<aggregation> other_agg) -> column_view {
       cudf::detail::aggregation_dispatcher(other_agg->kind, *this, *other_agg);
-      auto result = cache.get_result(values, *other_agg);
-      udf_input.emplace(std::move(other_agg), std::move(result));
-    }
+      return cache.get_result(values, *other_agg);
+    };
   }
 
-  auto output = (*udf_ptr)(udf_input, stream, mr);
-  CUDF_EXPECTS(std::holds_alternative<std::unique_ptr<column>>(output),
-               "Invalid output type from HOST_UDF groupby aggregation.");
-  cache.add_result(values, agg, std::get<std::unique_ptr<column>>(std::move(output)));
+  cache.add_result(values, agg, (*udf_ptr)(stream, mr));
 }
 
 }  // namespace detail
diff --git a/cpp/src/groupby/sort/host_udf_aggregation.cpp b/cpp/src/groupby/sort/host_udf_aggregation.cpp
index 0da47e17f48..6f1fe80c4bd 100644
--- a/cpp/src/groupby/sort/host_udf_aggregation.cpp
+++ b/cpp/src/groupby/sort/host_udf_aggregation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,51 +16,9 @@
 
 #include <cudf/aggregation/host_udf.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
-#include <cudf/detail/utilities/visitor_overload.hpp>
 
 namespace cudf {
 
-host_udf_base::data_attribute::data_attribute(data_attribute const& other)
-  : value{std::visit(cudf::detail::visitor_overload{[](auto const& val) { return value_type{val}; },
-                                                    [](std::unique_ptr<aggregation> const& val) {
-                                                      return value_type{val->clone()};
-                                                    }},
-                     other.value)}
-{
-}
-
-std::size_t host_udf_base::data_attribute::hash::operator()(data_attribute const& attr) const
-{
-  auto const hash_value =
-    std::visit(cudf::detail::visitor_overload{
-                 [](auto const& val) { return std::hash<int>{}(static_cast<int>(val)); },
-                 [](std::unique_ptr<aggregation> const& val) { return val->do_hash(); }},
-               attr.value);
-  return std::hash<std::size_t>{}(attr.value.index()) ^ hash_value;
-}
-
-bool host_udf_base::data_attribute::equal_to::operator()(data_attribute const& lhs,
-                                                         data_attribute const& rhs) const
-{
-  auto const& lhs_val = lhs.value;
-  auto const& rhs_val = rhs.value;
-  if (lhs_val.index() != rhs_val.index()) { return false; }
-  return std::visit(
-    cudf::detail::visitor_overload{
-      [](auto const& lhs_val, auto const& rhs_val) {
-        if constexpr (std::is_same_v<decltype(lhs_val), decltype(rhs_val)>) {
-          return lhs_val == rhs_val;
-        } else {
-          return false;
-        }
-      },
-      [](std::unique_ptr<aggregation> const& lhs_val, std::unique_ptr<aggregation> const& rhs_val) {
-        return lhs_val->is_equal(*rhs_val);
-      }},
-    lhs_val,
-    rhs_val);
-}
-
 namespace detail {
 
 host_udf_aggregation::host_udf_aggregation(std::unique_ptr<host_udf_base> udf_ptr_)
@@ -99,5 +57,9 @@ template CUDF_EXPORT std::unique_ptr<aggregation> make_host_udf_aggregation<aggr
   std::unique_ptr<host_udf_base>);
 template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
   make_host_udf_aggregation<groupby_aggregation>(std::unique_ptr<host_udf_base>);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+  make_host_udf_aggregation<reduce_aggregation>(std::unique_ptr<host_udf_base>);
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
+  make_host_udf_aggregation<segmented_reduce_aggregation>(std::unique_ptr<host_udf_base>);
 
 }  // namespace cudf
diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp
index 26535bed43b..3800835eaf1 100644
--- a/cpp/src/io/comp/comp.cpp
+++ b/cpp/src/io/comp/comp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,22 +16,45 @@
 
 #include "comp.hpp"
 
+#include "gpuinflate.hpp"
+#include "io/utilities/getenv_or.hpp"
 #include "io/utilities/hostdevice_vector.hpp"
 #include "nvcomp_adapter.hpp"
 
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
+#include <BS_thread_pool.hpp>
 #include <zlib.h>  // GZIP compression
 
 namespace cudf::io::detail {
 
 namespace {
 
+auto& h_comp_pool()
+{
+  static std::size_t pool_size =
+    getenv_or("LIBCUDF_HOST_COMPRESSION_NUM_THREADS", std::thread::hardware_concurrency());
+  static BS::thread_pool pool(pool_size);
+  return pool;
+}
+
+std::optional<nvcomp::compression_type> to_nvcomp_compression(compression_type compression)
+{
+  switch (compression) {
+    case compression_type::SNAPPY: return nvcomp::compression_type::SNAPPY;
+    case compression_type::ZSTD: return nvcomp::compression_type::ZSTD;
+    case compression_type::LZ4: return nvcomp::compression_type::LZ4;
+    case compression_type::ZLIB: return nvcomp::compression_type::DEFLATE;
+    default: return std::nullopt;
+  }
+}
+
 /**
  * @brief GZIP host compressor (includes header)
  */
@@ -98,8 +121,132 @@ std::vector<std::uint8_t> compress_snappy(host_span<uint8_t const> src,
   return cudf::detail::make_std_vector_sync<uint8_t>(d_dst, stream);
 }
 
+void device_compress(compression_type compression,
+                     device_span<device_span<uint8_t const> const> inputs,
+                     device_span<device_span<uint8_t> const> outputs,
+                     device_span<compression_result> results,
+                     rmm::cuda_stream_view stream)
+{
+  if (compression == compression_type::NONE) { return; }
+
+  auto const nvcomp_type = to_nvcomp_compression(compression);
+  auto nvcomp_disabled   = nvcomp_type.has_value() ? nvcomp::is_compression_disabled(*nvcomp_type)
+                                                   : "invalid compression type";
+  if (not nvcomp_disabled) {
+    return nvcomp::batched_compress(*nvcomp_type, inputs, outputs, results, stream);
+  }
+
+  switch (compression) {
+    case compression_type::SNAPPY: return gpu_snap(inputs, outputs, results, stream);
+    default: CUDF_FAIL("Compression error: " + nvcomp_disabled.value());
+  }
+}
+
+void host_compress(compression_type compression,
+                   device_span<device_span<uint8_t const> const> inputs,
+                   device_span<device_span<uint8_t> const> outputs,
+                   device_span<compression_result> results,
+                   rmm::cuda_stream_view stream)
+{
+  if (compression == compression_type::NONE) { return; }
+
+  auto const num_chunks = inputs.size();
+  auto h_results        = cudf::detail::make_host_vector<compression_result>(num_chunks, stream);
+  auto const h_inputs   = cudf::detail::make_host_vector_async(inputs, stream);
+  auto const h_outputs  = cudf::detail::make_host_vector_async(outputs, stream);
+  stream.synchronize();
+
+  std::vector<std::future<size_t>> tasks;
+  auto const num_streams =
+    std::min<std::size_t>({num_chunks,
+                           cudf::detail::global_cuda_stream_pool().get_stream_pool_size(),
+                           h_comp_pool().get_thread_count()});
+  auto const streams = cudf::detail::fork_streams(stream, num_streams);
+  for (size_t i = 0; i < num_chunks; ++i) {
+    auto const cur_stream = streams[i % streams.size()];
+    auto task = [d_in = h_inputs[i], d_out = h_outputs[i], cur_stream, compression]() -> size_t {
+      auto const h_in  = cudf::detail::make_host_vector_sync(d_in, cur_stream);
+      auto const h_out = compress(compression, h_in, cur_stream);
+      cudf::detail::cuda_memcpy<uint8_t>(d_out.subspan(0, h_out.size()), h_out, cur_stream);
+      return h_out.size();
+    };
+    tasks.emplace_back(h_comp_pool().submit_task(std::move(task)));
+  }
+
+  for (auto i = 0ul; i < num_chunks; ++i) {
+    h_results[i] = {tasks[i].get(), compression_status::SUCCESS};
+  }
+  cudf::detail::cuda_memcpy_async<compression_result>(results, h_results, stream);
+}
+
+[[nodiscard]] bool host_compression_supported(compression_type compression)
+{
+  switch (compression) {
+    case compression_type::GZIP:
+    case compression_type::NONE: return true;
+    default: return false;
+  }
+}
+
+[[nodiscard]] bool device_compression_supported(compression_type compression)
+{
+  auto const nvcomp_type = to_nvcomp_compression(compression);
+  switch (compression) {
+    case compression_type::LZ4:
+    case compression_type::ZLIB:
+    case compression_type::ZSTD: return not nvcomp::is_compression_disabled(nvcomp_type.value());
+    case compression_type::SNAPPY:
+    case compression_type::NONE: return true;
+    default: return false;
+  }
+}
+
+[[nodiscard]] bool use_host_compression(
+  compression_type compression,
+  [[maybe_unused]] device_span<device_span<uint8_t const> const> inputs,
+  [[maybe_unused]] device_span<device_span<uint8_t> const> outputs)
+{
+  CUDF_EXPECTS(
+    not host_compression_supported(compression) or device_compression_supported(compression),
+    "Unsupported compression type");
+  if (not host_compression_supported(compression)) { return false; }
+  if (not device_compression_supported(compression)) { return true; }
+  // If both host and device compression are supported, use the host if the env var is set
+  return getenv_or("LIBCUDF_USE_HOST_COMPRESSION", 0);
+}
+
 }  // namespace
 
+std::optional<size_t> compress_max_allowed_chunk_size(compression_type compression)
+{
+  if (auto nvcomp_type = to_nvcomp_compression(compression);
+      nvcomp_type.has_value() and not nvcomp::is_compression_disabled(*nvcomp_type)) {
+    return nvcomp::compress_max_allowed_chunk_size(*nvcomp_type);
+  }
+  return std::nullopt;
+}
+
+[[nodiscard]] size_t compress_required_chunk_alignment(compression_type compression)
+{
+  auto nvcomp_type = to_nvcomp_compression(compression);
+  if (compression == compression_type::NONE or not nvcomp_type.has_value() or
+      nvcomp::is_compression_disabled(*nvcomp_type)) {
+    return 1ul;
+  }
+
+  return nvcomp::required_alignment(*nvcomp_type);
+}
+
+[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size)
+{
+  if (compression == compression_type::NONE) { return uncompressed_size; }
+
+  if (auto nvcomp_type = to_nvcomp_compression(compression); nvcomp_type.has_value()) {
+    return nvcomp::compress_max_output_chunk_size(*nvcomp_type, uncompressed_size);
+  }
+  CUDF_FAIL("Unsupported compression type");
+}
+
 std::vector<std::uint8_t> compress(compression_type compression,
                                    host_span<uint8_t const> src,
                                    rmm::cuda_stream_view stream)
@@ -112,4 +259,18 @@ std::vector<std::uint8_t> compress(compression_type compression,
   }
 }
 
+void compress(compression_type compression,
+              device_span<device_span<uint8_t const> const> inputs,
+              device_span<device_span<uint8_t> const> outputs,
+              device_span<compression_result> results,
+              rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+  if (use_host_compression(compression, inputs, outputs)) {
+    return host_compress(compression, inputs, outputs, results, stream);
+  } else {
+    return device_compress(compression, inputs, outputs, results, stream);
+  }
+}
+
 }  // namespace cudf::io::detail
diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/comp.cu
similarity index 96%
rename from cpp/src/io/comp/statistics.cu
rename to cpp/src/io/comp/comp.cu
index caee9145d2c..af0f73869a2 100644
--- a/cpp/src/io/comp/statistics.cu
+++ b/cpp/src/io/comp/comp.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "gpuinflate.hpp"
+#include "comp.hpp"
 
 #include <rmm/exec_policy.hpp>
 
diff --git a/cpp/src/io/comp/comp.hpp b/cpp/src/io/comp/comp.hpp
index e16f26e1f06..90932a11499 100644
--- a/cpp/src/io/comp/comp.hpp
+++ b/cpp/src/io/comp/comp.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -57,5 +57,57 @@ std::vector<uint8_t> compress(compression_type compression,
                               host_span<uint8_t const> src,
                               rmm::cuda_stream_view stream);
 
+/**
+ * @brief Maximum size of uncompressed chunks that can be compressed.
+ *
+ * @param compression Compression type
+ * @returns maximum chunk size
+ */
+[[nodiscard]] std::optional<size_t> compress_max_allowed_chunk_size(compression_type compression);
+
+/**
+ * @brief Gets input and output alignment requirements for the given compression type.
+ *
+ * @param compression Compression type
+ * @returns required alignment
+ */
+[[nodiscard]] size_t compress_required_chunk_alignment(compression_type compression);
+
+/**
+ * @brief Gets the maximum size any chunk could compress to in the batch.
+ *
+ * @param compression Compression type
+ * @param uncompressed_size Size of the largest uncompressed chunk in the batch
+ */
+[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size);
+
+/**
+ * @brief Compresses device memory buffers.
+ *
+ * @param compression Type of compression of the input data
+ * @param inputs      Device memory buffers to compress
+ * @param outputs     Device memory buffers to store the compressed output
+ * @param results     Compression results
+ * @param stream      CUDA stream used for device memory operations and kernel launches
+ */
+void compress(compression_type compression,
+              device_span<device_span<uint8_t const> const> inputs,
+              device_span<device_span<uint8_t> const> outputs,
+              device_span<compression_result> results,
+              rmm::cuda_stream_view stream);
+
+/**
+ * @brief Aggregate results of compression into a single statistics object.
+ *
+ * @param inputs List of uncompressed input buffers
+ * @param results List of compression results
+ * @param stream CUDA stream to use
+ * @return writer_compression_statistics
+ */
+[[nodiscard]] writer_compression_statistics collect_compression_statistics(
+  device_span<device_span<uint8_t const> const> inputs,
+  device_span<compression_result const> results,
+  rmm::cuda_stream_view stream);
+
 }  // namespace io::detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp
index 4b09bd5a84c..0a35b230242 100644
--- a/cpp/src/io/comp/gpuinflate.hpp
+++ b/cpp/src/io/comp/gpuinflate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -124,17 +124,4 @@ void gpu_snap(device_span<device_span<uint8_t const> const> inputs,
               device_span<compression_result> results,
               rmm::cuda_stream_view stream);
 
-/**
- * @brief Aggregate results of compression into a single statistics object.
- *
- * @param inputs List of uncompressed input buffers
- * @param results List of compression results
- * @param stream CUDA stream to use
- * @return writer_compression_statistics
- */
-[[nodiscard]] writer_compression_statistics collect_compression_statistics(
-  device_span<device_span<uint8_t const> const> inputs,
-  device_span<compression_result const> results,
-  rmm::cuda_stream_view stream);
-
 }  // namespace cudf::io::detail
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 88423122e16..d63fa9f5c35 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -766,6 +766,7 @@ void parquet_writer_options_base::set_stats_level(statistics_freq sf) { _stats_l
 void parquet_writer_options_base::set_compression(compression_type compression)
 {
   _compression = compression;
+  if (compression == compression_type::AUTO) { _compression = compression_type::SNAPPY; }
 }
 
 void parquet_writer_options_base::enable_int96_timestamps(bool req)
diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp
index f4e75f78dec..8b30cee6681 100644
--- a/cpp/src/io/orc/orc_gpu.hpp
+++ b/cpp/src/io/orc/orc_gpu.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -407,7 +407,7 @@ void CompactOrcDataStreams(device_2dspan<StripeStream> strm_desc,
 std::optional<writer_compression_statistics> CompressOrcDataStreams(
   device_span<uint8_t> compressed_data,
   uint32_t num_compressed_blocks,
-  CompressionKind compression,
+  compression_type compression,
   uint32_t comp_blk_size,
   uint32_t max_comp_blk_size,
   uint32_t comp_block_align,
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index 79ecca0ca99..4f296bb5bfc 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,6 @@
  */
 
 #include "io/comp/gpuinflate.hpp"
-#include "io/comp/nvcomp_adapter.hpp"
 #include "io/utilities/block_utils.cuh"
 #include "io/utilities/time_utils.cuh"
 #include "orc_gpu.hpp"
@@ -45,8 +44,6 @@ namespace io {
 namespace orc {
 namespace gpu {
 
-namespace nvcomp = cudf::io::detail::nvcomp;
-
 using cudf::detail::device_2dspan;
 using cudf::io::detail::compression_result;
 using cudf::io::detail::compression_status;
@@ -1362,7 +1359,7 @@ void CompactOrcDataStreams(device_2dspan<StripeStream> strm_desc,
 std::optional<writer_compression_statistics> CompressOrcDataStreams(
   device_span<uint8_t> compressed_data,
   uint32_t num_compressed_blocks,
-  CompressionKind compression,
+  compression_type compression,
   uint32_t comp_blk_size,
   uint32_t max_comp_blk_size,
   uint32_t comp_block_align,
@@ -1387,47 +1384,7 @@ std::optional<writer_compression_statistics> CompressOrcDataStreams(
                                                                             max_comp_blk_size,
                                                                             comp_block_align);
 
-  if (compression == SNAPPY) {
-    try {
-      if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) {
-        cudf::io::detail::gpu_snap(comp_in, comp_out, comp_res, stream);
-      } else {
-        nvcomp::batched_compress(
-          nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream);
-      }
-    } catch (...) {
-      // There was an error in compressing so set an error status for each block
-      thrust::for_each(
-        rmm::exec_policy(stream),
-        comp_res.begin(),
-        comp_res.end(),
-        [] __device__(compression_result & stat) { stat.status = compression_status::FAILURE; });
-      // Since SNAPPY is the default compression (may not be explicitly requested), fall back to
-      // writing without compression
-      CUDF_LOG_WARN("ORC writer: compression failed, writing uncompressed data");
-    }
-  } else if (compression == ZLIB) {
-    if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE);
-        reason) {
-      CUDF_FAIL("Compression error: " + reason.value());
-    }
-    nvcomp::batched_compress(
-      nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_res, stream);
-  } else if (compression == ZSTD) {
-    if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD);
-        reason) {
-      CUDF_FAIL("Compression error: " + reason.value());
-    }
-    nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream);
-  } else if (compression == LZ4) {
-    if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::LZ4);
-        reason) {
-      CUDF_FAIL("Compression error: " + reason.value());
-    }
-    nvcomp::batched_compress(nvcomp::compression_type::LZ4, comp_in, comp_out, comp_res, stream);
-  } else if (compression != NONE) {
-    CUDF_FAIL("Unsupported compression type");
-  }
+  cudf::io::detail::compress(compression, comp_in, comp_out, comp_res, stream);
 
   dim3 dim_block_compact(1024, 1);
   gpuCompactCompressedBlocks<<<dim_grid, dim_block_compact, 0, stream.value()>>>(
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index ce868b83c04..aa0b509981a 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,6 @@
  * @brief cuDF-IO ORC writer class implementation
  */
 
-#include "io/comp/nvcomp_adapter.hpp"
 #include "io/orc/orc_gpu.hpp"
 #include "io/statistics/column_statistics.cuh"
 #include "io/utilities/column_utils.cuh"
@@ -71,8 +70,6 @@
 
 namespace cudf::io::orc::detail {
 
-namespace nvcomp = cudf::io::detail::nvcomp;
-
 template <typename T>
 [[nodiscard]] constexpr int varint_size(T val)
 {
@@ -92,21 +89,8 @@ struct row_group_index_info {
 };
 
 namespace {
-
 /**
- * @brief Translates ORC compression to nvCOMP compression
- */
-auto to_nvcomp_compression_type(CompressionKind compression_kind)
-{
-  if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY;
-  if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE;
-  if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD;
-  if (compression_kind == LZ4) return nvcomp::compression_type::LZ4;
-  CUDF_FAIL("Unsupported compression type");
-}
-
-/**
- * @brief Translates cuDF compression to ORC compression
+ * @brief Translates cuDF compression to ORC compression.
  */
 orc::CompressionKind to_orc_compression(compression_type compression)
 {
@@ -122,19 +106,14 @@ orc::CompressionKind to_orc_compression(compression_type compression)
 }
 
 /**
- * @brief Returns the block size for a given compression kind.
+ * @brief Returns the block size for a given compression format.
  */
-constexpr size_t compression_block_size(orc::CompressionKind compression)
+size_t compression_block_size(compression_type compression)
 {
-  if (compression == orc::CompressionKind::NONE) { return 0; }
-
-  auto const ncomp_type   = to_nvcomp_compression_type(compression);
-  auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type)
-                              ? std::nullopt
-                              : nvcomp::compress_max_allowed_chunk_size(ncomp_type);
+  auto const comp_limit = compress_max_allowed_chunk_size(compression);
 
   constexpr size_t max_block_size = 256 * 1024;
-  return std::min(nvcomp_limit.value_or(max_block_size), max_block_size);
+  return std::min(comp_limit.value_or(max_block_size), max_block_size);
 }
 
 /**
@@ -534,26 +513,6 @@ size_t RLE_stream_size(TypeKind kind, size_t count)
   }
 }
 
-auto uncomp_block_alignment(CompressionKind compression_kind)
-{
-  if (compression_kind == NONE or
-      nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) {
-    return 1ul;
-  }
-
-  return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind));
-}
-
-auto comp_block_alignment(CompressionKind compression_kind)
-{
-  if (compression_kind == NONE or
-      nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) {
-    return 1ul;
-  }
-
-  return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind));
-}
-
 /**
  * @brief Builds up per-column streams.
  *
@@ -566,7 +525,7 @@ orc_streams create_streams(host_span<orc_column_view> columns,
                            file_segmentation const& segmentation,
                            std::map<uint32_t, size_t> const& decimal_column_sizes,
                            bool enable_dictionary,
-                           CompressionKind compression_kind,
+                           compression_type compression,
                            single_write_mode write_mode)
 {
   // 'column 0' row index stream
@@ -610,7 +569,7 @@ orc_streams create_streams(host_span<orc_column_view> columns,
 
     auto add_stream =
       [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) {
-        auto const max_alignment_padding = uncomp_block_alignment(compression_kind) - 1;
+        auto const max_alignment_padding = compress_required_chunk_alignment(compression) - 1;
         const auto base                  = column.index() * gpu::CI_NUM_STREAMS;
         ids[base + index_type]           = streams.size();
         streams.push_back(orc::Stream{
@@ -1473,7 +1432,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer,
  * @param[in] rg_stats row group level statistics
  * @param[in,out] stripe Stream's parent stripe
  * @param[in,out] streams List of all streams
- * @param[in] compression_kind The compression kind
+ * @param[in] compression The compression format
  * @param[in] compression_blocksize The block size used for compression
  * @param[in] out_sink Sink for writing data
  */
@@ -1487,7 +1446,7 @@ void write_index_stream(int32_t stripe_id,
                         host_span<ColStatsBlob const> rg_stats,
                         StripeInformation* stripe,
                         orc_streams* streams,
-                        CompressionKind compression_kind,
+                        compression_type compression,
                         size_t compression_blocksize,
                         std::unique_ptr<data_sink> const& out_sink)
 {
@@ -1501,7 +1460,7 @@ void write_index_stream(int32_t stripe_id,
     row_group_index_info record;
     if (stream.ids[type] > 0) {
       record.pos = 0;
-      if (compression_kind != NONE) {
+      if (compression != compression_type::NONE) {
         auto const& ss   = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)];
         record.blk_pos   = ss.first_block;
         record.comp_pos  = 0;
@@ -1541,7 +1500,7 @@ void write_index_stream(int32_t stripe_id,
     }
   }
 
-  ProtobufWriter pbw((compression_kind != NONE) ? 3 : 0);
+  ProtobufWriter pbw((compression != compression_type::NONE) ? 3 : 0);
 
   // Add row index entries
   auto const& rowgroups_range = segmentation.stripes[stripe_id];
@@ -1566,7 +1525,7 @@ void write_index_stream(int32_t stripe_id,
   });
 
   (*streams)[stream_id].length = pbw.size();
-  if (compression_kind != NONE) {
+  if (compression != compression_type::NONE) {
     uint32_t uncomp_ix_len = (uint32_t)((*streams)[stream_id].length - 3) * 2 + 1;
     pbw.buffer()[0]        = static_cast<uint8_t>(uncomp_ix_len >> 0);
     pbw.buffer()[1]        = static_cast<uint8_t>(uncomp_ix_len >> 8);
@@ -1585,7 +1544,7 @@ void write_index_stream(int32_t stripe_id,
  * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer
  * @param[in,out] stripe Stream's parent stripe
  * @param[in,out] streams List of all streams
- * @param[in] compression_kind The compression kind
+ * @param[in] compression The compression format
  * @param[in] out_sink Sink for writing data
  * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @return An std::future that should be synchronized to ensure the writing is complete
@@ -1596,7 +1555,7 @@ std::future<void> write_data_stream(gpu::StripeStream const& strm_desc,
                                     host_span<uint8_t> bounce_buffer,
                                     StripeInformation* stripe,
                                     orc_streams* streams,
-                                    CompressionKind compression_kind,
+                                    compression_type compression,
                                     std::unique_ptr<data_sink> const& out_sink,
                                     rmm::cuda_stream_view stream)
 {
@@ -1606,8 +1565,9 @@ std::future<void> write_data_stream(gpu::StripeStream const& strm_desc,
     return std::async(std::launch::deferred, [] {});
   }
 
-  auto const* stream_in = (compression_kind == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type]
-                                                     : (compressed_data + strm_desc.bfr_offset);
+  auto const* stream_in = (compression == compression_type::NONE)
+                            ? enc_stream.data_ptrs[strm_desc.stream_type]
+                            : (compressed_data + strm_desc.bfr_offset);
 
   auto write_task = [&]() {
     if (out_sink->is_device_write_preferred(length)) {
@@ -1627,15 +1587,15 @@ std::future<void> write_data_stream(gpu::StripeStream const& strm_desc,
 /**
  * @brief Insert 3-byte uncompressed block headers in a byte vector
  *
- * @param compression_kind The compression kind
+ * @param compression The compression kind
  * @param compression_blocksize The block size used for compression
  * @param v The destitation byte vector to write, which must include initial 3-byte header
  */
-void add_uncompressed_block_headers(CompressionKind compression_kind,
+void add_uncompressed_block_headers(compression_type compression,
                                     size_t compression_blocksize,
                                     std::vector<uint8_t>& v)
 {
-  if (compression_kind != NONE) {
+  if (compression != compression_type::NONE) {
     size_t uncomp_len = v.size() - 3, pos = 0, block_len;
     while (uncomp_len > compression_blocksize) {
       block_len  = compression_blocksize * 2 + 1;
@@ -2021,14 +1981,6 @@ std::map<uint32_t, size_t> decimal_column_sizes(
   return column_sizes;
 }
 
-size_t max_compression_output_size(CompressionKind compression_kind, uint32_t compression_blocksize)
-{
-  if (compression_kind == NONE) return 0;
-
-  return nvcomp::compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind),
-                                                compression_blocksize);
-}
-
 std::unique_ptr<table_input_metadata> make_table_meta(table_view const& input)
 {
   auto table_meta = std::make_unique<table_input_metadata>(input);
@@ -2287,7 +2239,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table,
  * @param row_index_stride The row index stride
  * @param enable_dictionary Whether dictionary is enabled
  * @param sort_dictionaries Whether to sort the dictionaries
- * @param compression_kind The compression kind
+ * @param compression The compression format
  * @param compression_blocksize The block size used for compression
  * @param stats_freq Column statistics granularity type for parquet/orc writers
  * @param collect_compression_stats Flag to indicate if compression statistics should be collected
@@ -2302,7 +2254,7 @@ auto convert_table_to_orc_data(table_view const& input,
                                size_type row_index_stride,
                                bool enable_dictionary,
                                bool sort_dictionaries,
-                               CompressionKind compression_kind,
+                               compression_type compression,
                                size_t compression_blocksize,
                                statistics_freq stats_freq,
                                bool collect_compression_stats,
@@ -2329,17 +2281,16 @@ auto convert_table_to_orc_data(table_view const& input,
   auto stripe_dicts    = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream);
   auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream);
 
-  auto const uncompressed_block_align = uncomp_block_alignment(compression_kind);
-  auto const compressed_block_align   = comp_block_alignment(compression_kind);
+  auto const block_align = compress_required_chunk_alignment(compression);
 
   auto streams  = create_streams(orc_table.columns,
                                 segmentation,
                                 decimal_column_sizes(dec_chunk_sizes.rg_sizes),
                                 enable_dictionary,
-                                compression_kind,
+                                compression,
                                 write_mode);
   auto enc_data = encode_columns(
-    orc_table, std::move(dec_chunk_sizes), segmentation, streams, uncompressed_block_align, stream);
+    orc_table, std::move(dec_chunk_sizes), segmentation, streams, block_align, stream);
 
   stripe_dicts.on_encode_complete(stream);
 
@@ -2371,16 +2322,15 @@ auto convert_table_to_orc_data(table_view const& input,
   size_t compressed_bfr_size   = 0;
   size_t num_compressed_blocks = 0;
 
-  auto const max_compressed_block_size =
-    max_compression_output_size(compression_kind, compression_blocksize);
+  auto const max_compressed_block_size = max_compressed_size(compression, compression_blocksize);
   auto const padded_max_compressed_block_size =
-    util::round_up_unsafe<size_t>(max_compressed_block_size, compressed_block_align);
+    util::round_up_unsafe<size_t>(max_compressed_block_size, block_align);
   auto const padded_block_header_size =
-    util::round_up_unsafe<size_t>(block_header_size, compressed_block_align);
+    util::round_up_unsafe<size_t>(block_header_size, block_align);
 
   for (auto& ss : strm_descs.host_view().flat_view()) {
     size_t stream_size = ss.stream_size;
-    if (compression_kind != NONE) {
+    if (compression != compression_type::NONE) {
       ss.first_block = num_compressed_blocks;
       ss.bfr_offset  = compressed_bfr_size;
 
@@ -2401,14 +2351,14 @@ auto convert_table_to_orc_data(table_view const& input,
                comp_results.d_begin(),
                comp_results.d_end(),
                compression_result{0, compression_status::FAILURE});
-  if (compression_kind != NONE) {
+  if (compression != compression_type::NONE) {
     strm_descs.host_to_device_async(stream);
     compression_stats = gpu::CompressOrcDataStreams(compressed_data,
                                                     num_compressed_blocks,
-                                                    compression_kind,
+                                                    compression,
                                                     compression_blocksize,
                                                     max_compressed_block_size,
-                                                    compressed_block_align,
+                                                    block_align,
                                                     collect_compression_stats,
                                                     strm_descs,
                                                     enc_data.streams,
@@ -2459,8 +2409,8 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
   : _stream(stream),
     _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()},
     _row_index_stride{options.get_row_index_stride()},
-    _compression_kind(to_orc_compression(options.get_compression())),
-    _compression_blocksize(compression_block_size(_compression_kind)),
+    _compression{options.get_compression()},
+    _compression_blocksize(compression_block_size(_compression)),
     _compression_statistics(options.get_compression_statistics()),
     _stats_freq(options.get_statistics_freq()),
     _sort_dictionaries{options.get_enable_dictionary_sort()},
@@ -2480,8 +2430,8 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
   : _stream(stream),
     _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()},
     _row_index_stride{options.get_row_index_stride()},
-    _compression_kind(to_orc_compression(options.get_compression())),
-    _compression_blocksize(compression_block_size(_compression_kind)),
+    _compression{options.get_compression()},
+    _compression_blocksize(compression_block_size(_compression)),
     _compression_statistics(options.get_compression_statistics()),
     _stats_freq(options.get_statistics_freq()),
     _sort_dictionaries{options.get_enable_dictionary_sort()},
@@ -2526,7 +2476,7 @@ void writer::impl::write(table_view const& input)
                                        _row_index_stride,
                                        _enable_dictionary,
                                        _sort_dictionaries,
-                                       _compression_kind,
+                                       _compression,
                                        _compression_blocksize,
                                        _stats_freq,
                                        _compression_statistics != nullptr,
@@ -2613,7 +2563,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data,
                          rg_stats,
                          &stripe,
                          &streams,
-                         _compression_kind,
+                         _compression,
                          _compression_blocksize,
                          _out_sink);
     }
@@ -2627,7 +2577,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data,
         bounce_buffer,
         &stripe,
         &streams,
-        _compression_kind,
+        _compression,
         _out_sink,
         _stream));
     }
@@ -2645,10 +2595,10 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data,
           : 0;
       if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; }
     }
-    ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0);
+    ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0);
     pbw.write(sf);
     stripe.footerLength = pbw.size();
-    if (_compression_kind != NONE) {
+    if (_compression != compression_type::NONE) {
       uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1;
       pbw.buffer()[0]        = static_cast<uint8_t>(uncomp_sf_len >> 0);
       pbw.buffer()[1]        = static_cast<uint8_t>(uncomp_sf_len >> 8);
@@ -2780,21 +2730,21 @@ void writer::impl::close()
 
   // Write statistics metadata
   if (not _orc_meta.stripeStats.empty()) {
-    ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0);
+    ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0);
     pbw.write(_orc_meta);
-    add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer());
+    add_uncompressed_block_headers(_compression, _compression_blocksize, pbw.buffer());
     ps.metadataLength = pbw.size();
     _out_sink->host_write(pbw.data(), pbw.size());
   } else {
     ps.metadataLength = 0;
   }
-  ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0);
+  ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0);
   pbw.write(_footer);
-  add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer());
+  add_uncompressed_block_headers(_compression, _compression_blocksize, pbw.buffer());
 
   // Write postscript metadata
   ps.footerLength         = pbw.size();
-  ps.compression          = _compression_kind;
+  ps.compression          = to_orc_compression(_compression);
   ps.compressionBlockSize = _compression_blocksize;
   ps.version              = {0, 12};  // Hive 0.12
   ps.writerVersion        = cudf_writer_version;
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index cae849ee315..7d23482cb17 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -342,7 +342,7 @@ class writer::impl {
   // Writer options.
   stripe_size_limits const _max_stripe_size;
   size_type const _row_index_stride;
-  CompressionKind const _compression_kind;
+  compression_type const _compression;
   size_t const _compression_blocksize;
   std::shared_ptr<writer_compression_statistics> _compression_statistics;  // Optional output
   statistics_freq const _stats_freq;
diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu
new file mode 100644
index 00000000000..8c404950efa
--- /dev/null
+++ b/cpp/src/io/parquet/bloom_filter_reader.cu
@@ -0,0 +1,683 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compact_protocol_reader.hpp"
+#include "io/parquet/parquet.hpp"
+#include "reader_impl_helpers.hpp"
+
+#include <cudf/ast/detail/expression_transformer.hpp>
+#include <cudf/ast/detail/operators.hpp>
+#include <cudf/ast/expressions.hpp>
+#include <cudf/detail/cuco_helpers.hpp>
+#include <cudf/detail/transform.hpp>
+#include <cudf/hashing/detail/xxhash_64.cuh>
+#include <cudf/logger.hpp>
+#include <cudf/utilities/span.hpp>
+#include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuco/bloom_filter_policies.cuh>
+#include <cuco/bloom_filter_ref.cuh>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/tabulate.h>
+
+#include <future>
+#include <numeric>
+#include <optional>
+
+namespace cudf::io::parquet::detail {
+namespace {
+
+/**
+ * @brief Converts bloom filter membership results (for each column chunk) to a device column.
+ *
+ */
+struct bloom_filter_caster {
+  cudf::device_span<cudf::device_span<cuda::std::byte> const> bloom_filter_spans;
+  host_span<Type const> parquet_types;
+  size_t total_row_groups;
+  size_t num_equality_columns;
+
+  enum class is_int96_timestamp : bool { YES, NO };
+
+  template <typename T, is_int96_timestamp IS_INT96_TIMESTAMP = is_int96_timestamp::NO>
+  std::unique_ptr<cudf::column> query_bloom_filter(cudf::size_type equality_col_idx,
+                                                   cudf::data_type dtype,
+                                                   ast::literal const* const literal,
+                                                   rmm::cuda_stream_view stream) const
+  {
+    using key_type    = T;
+    using policy_type = cuco::arrow_filter_policy<key_type, cudf::hashing::detail::XXHash_64>;
+    using word_type   = typename policy_type::word_type;
+
+    // Boolean, List, Struct, Dictionary types are not supported
+    if constexpr (std::is_same_v<T, bool> or
+                  (cudf::is_compound<T>() and not std::is_same_v<T, string_view>)) {
+      CUDF_FAIL("Bloom filters do not support boolean or compound types");
+    } else {
+      // Check if the literal has the same type as the predicate column
+      CUDF_EXPECTS(
+        dtype == literal->get_data_type() and
+          cudf::have_same_types(
+            cudf::column_view{dtype, 0, {}, {}, 0, 0, {}},
+            cudf::scalar_type_t<T>(T{}, false, stream, cudf::get_current_device_resource_ref())),
+        "Mismatched predicate column and literal types");
+    }
+
+    // Filter properties
+    auto constexpr bytes_per_block = sizeof(word_type) * policy_type::words_per_block;
+
+    rmm::device_buffer results{total_row_groups, stream, cudf::get_current_device_resource_ref()};
+    cudf::device_span<bool> results_span{static_cast<bool*>(results.data()), total_row_groups};
+
+    // Query literal in bloom filters from each column chunk (row group).
+    thrust::tabulate(
+      rmm::exec_policy_nosync(stream),
+      results_span.begin(),
+      results_span.end(),
+      [filter_span          = bloom_filter_spans.data(),
+       d_scalar             = literal->get_value(),
+       col_idx              = equality_col_idx,
+       num_equality_columns = num_equality_columns] __device__(auto row_group_idx) {
+        // Filter bitset buffer index
+        auto const filter_idx  = col_idx + (num_equality_columns * row_group_idx);
+        auto const filter_size = filter_span[filter_idx].size();
+
+        // If no bloom filter, then fill in `true` as membership cannot be determined
+        if (filter_size == 0) { return true; }
+
+        // Number of filter blocks
+        auto const num_filter_blocks = filter_size / bytes_per_block;
+
+        // Create a bloom filter view.
+        cuco::bloom_filter_ref<key_type,
+                               cuco::extent<std::size_t>,
+                               cuco::thread_scope_thread,
+                               policy_type>
+          filter{reinterpret_cast<word_type*>(filter_span[filter_idx].data()),
+                 num_filter_blocks,
+                 {},  // Thread scope as the same literal is being searched across different bitsets
+                      // per thread
+                 {}};  // Arrow policy with cudf::hashing::detail::XXHash_64 seeded with 0 for Arrow
+                       // compatibility
+
+        // If int96_timestamp type, convert literal to string_view and query bloom
+        // filter
+        if constexpr (cuda::std::is_same_v<T, cudf::string_view> and
+                      IS_INT96_TIMESTAMP == is_int96_timestamp::YES) {
+          auto const int128_key = static_cast<__int128_t>(d_scalar.value<int64_t>());
+          cudf::string_view probe_key{reinterpret_cast<char const*>(&int128_key), 12};
+          return filter.contains(probe_key);
+        } else {
+          // Query the bloom filter and store results
+          return filter.contains(d_scalar.value<T>());
+        }
+      });
+
+    return std::make_unique<cudf::column>(cudf::data_type{cudf::type_id::BOOL8},
+                                          static_cast<cudf::size_type>(total_row_groups),
+                                          std::move(results),
+                                          rmm::device_buffer{},
+                                          0);
+  }
+
+  // Creates device columns from bloom filter membership
+  template <typename T>
+  std::unique_ptr<cudf::column> operator()(cudf::size_type equality_col_idx,
+                                           cudf::data_type dtype,
+                                           ast::literal* const literal,
+                                           rmm::cuda_stream_view stream) const
+  {
+    // For INT96 timestamps, use cudf::string_view type and set is_int96_timestamp to YES
+    if constexpr (cudf::is_timestamp<T>()) {
+      if (parquet_types[equality_col_idx] == Type::INT96) {
+        // For INT96 timestamps, use cudf::string_view type and set is_int96_timestamp to YES
+        return query_bloom_filter<cudf::string_view, bloom_filter_caster::is_int96_timestamp::YES>(
+          equality_col_idx, dtype, literal, stream);
+      }
+    }
+
+    // For all other cases
+    return query_bloom_filter<T>(equality_col_idx, dtype, literal, stream);
+  }
+};
+
+/**
+ * @brief Collects lists of equality predicate literals in the AST expression, one list per input
+ * table column. This is used in row group filtering based on bloom filters.
+ */
+class equality_literals_collector : public ast::detail::expression_transformer {
+ public:
+  equality_literals_collector() = default;
+
+  equality_literals_collector(ast::expression const& expr, cudf::size_type num_input_columns)
+    : _num_input_columns{num_input_columns}
+  {
+    _equality_literals.resize(_num_input_columns);
+    expr.accept(*this);
+  }
+
+  /**
+   * @copydoc ast::detail::expression_transformer::visit(ast::literal const& )
+   */
+  std::reference_wrapper<ast::expression const> visit(ast::literal const& expr) override
+  {
+    return expr;
+  }
+
+  /**
+   * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& )
+   */
+  std::reference_wrapper<ast::expression const> visit(ast::column_reference const& expr) override
+  {
+    CUDF_EXPECTS(expr.get_table_source() == ast::table_reference::LEFT,
+                 "BloomfilterAST supports only left table");
+    CUDF_EXPECTS(expr.get_column_index() < _num_input_columns,
+                 "Column index cannot be more than number of columns in the table");
+    return expr;
+  }
+
+  /**
+   * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& )
+   */
+  std::reference_wrapper<ast::expression const> visit(
+    ast::column_name_reference const& expr) override
+  {
+    CUDF_FAIL("Column name reference is not supported in BloomfilterAST");
+  }
+
+  /**
+   * @copydoc ast::detail::expression_transformer::visit(ast::operation const& )
+   */
+  std::reference_wrapper<ast::expression const> visit(ast::operation const& expr) override
+  {
+    using cudf::ast::ast_operator;
+    auto const operands = expr.get_operands();
+    auto const op       = expr.get_operator();
+
+    if (auto* v = dynamic_cast<ast::column_reference const*>(&operands[0].get())) {
+      // First operand should be column reference, second should be literal.
+      CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2,
+                   "Only binary operations are supported on column reference");
+      auto const literal_ptr = dynamic_cast<ast::literal const*>(&operands[1].get());
+      CUDF_EXPECTS(literal_ptr != nullptr,
+                   "Second operand of binary operation with column reference must be a literal");
+      v->accept(*this);
+
+      // Push to the corresponding column's literals list iff equality predicate is seen
+      if (op == ast_operator::EQUAL) {
+        auto const col_idx = v->get_column_index();
+        _equality_literals[col_idx].emplace_back(const_cast<ast::literal*>(literal_ptr));
+      }
+    } else {
+      // Just visit the operands and ignore any output
+      std::ignore = visit_operands(operands);
+    }
+
+    return expr;
+  }
+
+  /**
+   * @brief Vectors of equality literals in the AST expression, one per input table column
+   *
+   * @return Vectors of equality literals, one per input table column
+   */
+  [[nodiscard]] std::vector<std::vector<ast::literal*>> get_equality_literals() &&
+  {
+    return std::move(_equality_literals);
+  }
+
+ private:
+  std::vector<std::vector<ast::literal*>> _equality_literals;
+
+ protected:
+  std::vector<std::reference_wrapper<ast::expression const>> visit_operands(
+    cudf::host_span<std::reference_wrapper<ast::expression const> const> operands)
+  {
+    std::vector<std::reference_wrapper<ast::expression const>> transformed_operands;
+    for (auto const& operand : operands) {
+      auto const new_operand = operand.get().accept(*this);
+      transformed_operands.push_back(new_operand);
+    }
+    return transformed_operands;
+  }
+  size_type _num_input_columns;
+};
+
+/**
+ * @brief Converts AST expression to bloom filter membership (BloomfilterAST) expression.
+ * This is used in row group filtering based on equality predicate.
+ */
+class bloom_filter_expression_converter : public equality_literals_collector {
+ public:
+  bloom_filter_expression_converter(
+    ast::expression const& expr,
+    size_type num_input_columns,
+    cudf::host_span<std::vector<ast::literal*> const> equality_literals)
+    : _equality_literals{equality_literals}
+  {
+    // Set the num columns
+    _num_input_columns = num_input_columns;
+
+    // Compute and store columns literals offsets
+    _col_literals_offsets.reserve(_num_input_columns + 1);
+    _col_literals_offsets.emplace_back(0);
+
+    std::transform(equality_literals.begin(),
+                   equality_literals.end(),
+                   std::back_inserter(_col_literals_offsets),
+                   [&](auto const& col_literal_map) {
+                     return _col_literals_offsets.back() +
+                            static_cast<cudf::size_type>(col_literal_map.size());
+                   });
+
+    // Add this visitor
+    expr.accept(*this);
+  }
+
+  /**
+   * @brief Delete equality literals getter as it's not needed in the derived class
+   */
+  [[nodiscard]] std::vector<std::vector<ast::literal*>> get_equality_literals() && = delete;
+
+  // Bring all overloads of `visit` from equality_predicate_collector into scope
+  using equality_literals_collector::visit;
+
+  /**
+   * @copydoc ast::detail::expression_transformer::visit(ast::operation const& )
+   */
+  std::reference_wrapper<ast::expression const> visit(ast::operation const& expr) override
+  {
+    using cudf::ast::ast_operator;
+    auto const operands = expr.get_operands();
+    auto const op       = expr.get_operator();
+
+    if (auto* v = dynamic_cast<ast::column_reference const*>(&operands[0].get())) {
+      // First operand should be column reference, second should be literal.
+      CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2,
+                   "Only binary operations are supported on column reference");
+      CUDF_EXPECTS(dynamic_cast<ast::literal const*>(&operands[1].get()) != nullptr,
+                   "Second operand of binary operation with column reference must be a literal");
+      v->accept(*this);
+
+      if (op == ast_operator::EQUAL) {
+        // Search the literal in this input column's equality literals list and add to the offset.
+        auto const col_idx            = v->get_column_index();
+        auto const& equality_literals = _equality_literals[col_idx];
+        auto col_literal_offset       = _col_literals_offsets[col_idx];
+        auto const literal_iter       = std::find(equality_literals.cbegin(),
+                                            equality_literals.cend(),
+                                            dynamic_cast<ast::literal const*>(&operands[1].get()));
+        CUDF_EXPECTS(literal_iter != equality_literals.end(), "Could not find the literal ptr");
+        col_literal_offset += std::distance(equality_literals.cbegin(), literal_iter);
+
+        // Evaluate boolean is_true(value) expression as NOT(NOT(value))
+        auto const& value = _bloom_filter_expr.push(ast::column_reference{col_literal_offset});
+        _bloom_filter_expr.push(ast::operation{
+          ast_operator::NOT, _bloom_filter_expr.push(ast::operation{ast_operator::NOT, value})});
+      }
+      // For all other expressions, push an always true expression
+      else {
+        _bloom_filter_expr.push(
+          ast::operation{ast_operator::NOT,
+                         _bloom_filter_expr.push(ast::operation{ast_operator::NOT, _always_true})});
+      }
+    } else {
+      auto new_operands = visit_operands(operands);
+      if (cudf::ast::detail::ast_operator_arity(op) == 2) {
+        _bloom_filter_expr.push(ast::operation{op, new_operands.front(), new_operands.back()});
+      } else if (cudf::ast::detail::ast_operator_arity(op) == 1) {
+        _bloom_filter_expr.push(ast::operation{op, new_operands.front()});
+      }
+    }
+    return _bloom_filter_expr.back();
+  }
+
+  /**
+   * @brief Returns the AST to apply on bloom filter membership.
+   *
+   * @return AST operation expression
+   */
+  [[nodiscard]] std::reference_wrapper<ast::expression const> get_bloom_filter_expr() const
+  {
+    return _bloom_filter_expr.back();
+  }
+
+ private:
+  std::vector<cudf::size_type> _col_literals_offsets;
+  cudf::host_span<std::vector<ast::literal*> const> _equality_literals;
+  ast::tree _bloom_filter_expr;
+  cudf::numeric_scalar<bool> _always_true_scalar{true};
+  ast::literal const _always_true{_always_true_scalar};
+};
+
+/**
+ * @brief Reads bloom filter data to device.
+ *
+ * @param sources Dataset sources
+ * @param num_chunks Number of total column chunks to read
+ * @param bloom_filter_data Device buffers to hold bloom filter bitsets for each chunk
+ * @param bloom_filter_offsets Bloom filter offsets for all chunks
+ * @param bloom_filter_sizes Bloom filter sizes for all chunks
+ * @param chunk_source_map Association between each column chunk and its source
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+void read_bloom_filter_data(host_span<std::unique_ptr<datasource> const> sources,
+                            size_t num_chunks,
+                            cudf::host_span<rmm::device_buffer> bloom_filter_data,
+                            cudf::host_span<std::optional<int64_t>> bloom_filter_offsets,
+                            cudf::host_span<std::optional<int32_t>> bloom_filter_sizes,
+                            std::vector<size_type> const& chunk_source_map,
+                            rmm::cuda_stream_view stream)
+{
+  // Read tasks for bloom filter data
+  std::vector<std::future<size_t>> read_tasks;
+
+  // Read bloom filters for all column chunks
+  std::for_each(
+    thrust::counting_iterator<size_t>(0),
+    thrust::counting_iterator(num_chunks),
+    [&](auto const chunk) {
+      // If bloom filter offset absent, fill in an empty buffer and skip ahead
+      if (not bloom_filter_offsets[chunk].has_value()) {
+        bloom_filter_data[chunk] = {};
+        return;
+      }
+      // Read bloom filter iff present
+      auto const bloom_filter_offset = bloom_filter_offsets[chunk].value();
+
+      // If Bloom filter size (header + bitset) is available, just read the entire thing.
+      // Else just read 256 bytes which will contain the entire header and may contain the
+      // entire bitset as well.
+      auto constexpr bloom_filter_size_guess = 256;
+      auto const initial_read_size =
+        static_cast<size_t>(bloom_filter_sizes[chunk].value_or(bloom_filter_size_guess));
+
+      // Read an initial buffer from source
+      auto& source = sources[chunk_source_map[chunk]];
+      auto buffer  = source->host_read(bloom_filter_offset, initial_read_size);
+
+      // Deserialize the Bloom filter header from the buffer.
+      BloomFilterHeader header;
+      CompactProtocolReader cp{buffer->data(), buffer->size()};
+      cp.read(&header);
+
+      // Get the hardcoded words_per_block value from `cuco::arrow_filter_policy` using a temporary
+      // `std::byte` key type.
+      auto constexpr words_per_block =
+        cuco::arrow_filter_policy<cuda::std::byte,
+                                  cudf::hashing::detail::XXHash_64>::words_per_block;
+
+      // Check if the bloom filter header is valid.
+      auto const is_header_valid =
+        (header.num_bytes % words_per_block) == 0 and
+        header.compression.compression == BloomFilterCompression::Compression::UNCOMPRESSED and
+        header.algorithm.algorithm == BloomFilterAlgorithm::Algorithm::SPLIT_BLOCK and
+        header.hash.hash == BloomFilterHash::Hash::XXHASH;
+
+      // Do not read if the bloom filter is invalid
+      if (not is_header_valid) {
+        bloom_filter_data[chunk] = {};
+        CUDF_LOG_WARN("Encountered an invalid bloom filter header. Skipping");
+        return;
+      }
+
+      // Bloom filter header size
+      auto const bloom_filter_header_size = static_cast<int64_t>(cp.bytecount());
+      auto const bitset_size              = static_cast<size_t>(header.num_bytes);
+
+      // Check if we already read in the filter bitset in the initial read.
+      if (initial_read_size >= bloom_filter_header_size + bitset_size) {
+        bloom_filter_data[chunk] =
+          rmm::device_buffer{buffer->data() + bloom_filter_header_size, bitset_size, stream};
+      }
+      // Read the bitset from datasource.
+      else {
+        auto const bitset_offset = bloom_filter_offset + bloom_filter_header_size;
+        // Directly read to device if preferred
+        if (source->is_device_read_preferred(bitset_size)) {
+          bloom_filter_data[chunk] = rmm::device_buffer{bitset_size, stream};
+          auto future_read_size =
+            source->device_read_async(bitset_offset,
+                                      bitset_size,
+                                      static_cast<uint8_t*>(bloom_filter_data[chunk].data()),
+                                      stream);
+
+          read_tasks.emplace_back(std::move(future_read_size));
+        } else {
+          buffer                   = source->host_read(bitset_offset, bitset_size);
+          bloom_filter_data[chunk] = rmm::device_buffer{buffer->data(), buffer->size(), stream};
+        }
+      }
+    });
+
+  // Read task sync function
+  for (auto& task : read_tasks) {
+    task.wait();
+  }
+}
+
+}  // namespace
+
+std::vector<rmm::device_buffer> aggregate_reader_metadata::read_bloom_filters(
+  host_span<std::unique_ptr<datasource> const> sources,
+  host_span<std::vector<size_type> const> row_group_indices,
+  host_span<int const> column_schemas,
+  size_type total_row_groups,
+  rmm::cuda_stream_view stream) const
+{
+  // Descriptors for all the chunks that make up the selected columns
+  auto const num_input_columns = column_schemas.size();
+  auto const num_chunks        = total_row_groups * num_input_columns;
+
+  // Association between each column chunk and its source
+  std::vector<size_type> chunk_source_map(num_chunks);
+
+  // Keep track of column chunk file offsets
+  std::vector<std::optional<int64_t>> bloom_filter_offsets(num_chunks);
+  std::vector<std::optional<int32_t>> bloom_filter_sizes(num_chunks);
+
+  // Gather all bloom filter offsets and sizes.
+  size_type chunk_count = 0;
+
+  // Flag to check if we have at least one valid bloom filter offset
+  auto have_bloom_filters = false;
+
+  // For all data sources
+  std::for_each(thrust::counting_iterator<size_t>(0),
+                thrust::counting_iterator(row_group_indices.size()),
+                [&](auto const src_index) {
+                  // Get all row group indices in the data source
+                  auto const& rg_indices = row_group_indices[src_index];
+                  // For all row groups
+                  std::for_each(rg_indices.cbegin(), rg_indices.cend(), [&](auto const rg_index) {
+                    // For all column chunks
+                    std::for_each(
+                      column_schemas.begin(), column_schemas.end(), [&](auto const schema_idx) {
+                        auto& col_meta = get_column_metadata(rg_index, src_index, schema_idx);
+
+                        // Get bloom filter offsets and sizes
+                        bloom_filter_offsets[chunk_count] = col_meta.bloom_filter_offset;
+                        bloom_filter_sizes[chunk_count]   = col_meta.bloom_filter_length;
+
+                        // Set `have_bloom_filters` if `bloom_filter_offset` is valid
+                        if (col_meta.bloom_filter_offset.has_value()) { have_bloom_filters = true; }
+
+                        // Map each column chunk to its source index
+                        chunk_source_map[chunk_count] = src_index;
+                        chunk_count++;
+                      });
+                  });
+                });
+
+  // Exit early if we don't have any bloom filters
+  if (not have_bloom_filters) { return {}; }
+
+  // Vector to hold bloom filter data
+  std::vector<rmm::device_buffer> bloom_filter_data(num_chunks);
+
+  // Read bloom filter data
+  read_bloom_filter_data(sources,
+                         num_chunks,
+                         bloom_filter_data,
+                         bloom_filter_offsets,
+                         bloom_filter_sizes,
+                         chunk_source_map,
+                         stream);
+
+  // Return bloom filter data
+  return bloom_filter_data;
+}
+
+std::vector<Type> aggregate_reader_metadata::get_parquet_types(
+  host_span<std::vector<size_type> const> row_group_indices,
+  host_span<int const> column_schemas) const
+{
+  std::vector<Type> parquet_types(column_schemas.size());
+  // Find a source with at least one row group
+  auto const src_iter = std::find_if(row_group_indices.begin(),
+                                     row_group_indices.end(),
+                                     [](auto const& rg) { return rg.size() > 0; });
+  CUDF_EXPECTS(src_iter != row_group_indices.end(), "");
+
+  // Source index
+  auto const src_index = std::distance(row_group_indices.begin(), src_iter);
+  std::transform(column_schemas.begin(),
+                 column_schemas.end(),
+                 parquet_types.begin(),
+                 [&](auto const schema_idx) {
+                   // Use the first row group in this source
+                   auto constexpr row_group_index = 0;
+                   return get_column_metadata(row_group_index, src_index, schema_idx).type;
+                 });
+
+  return parquet_types;
+}
+
+std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::apply_bloom_filters(
+  host_span<std::unique_ptr<datasource> const> sources,
+  host_span<std::vector<size_type> const> input_row_group_indices,
+  host_span<data_type const> output_dtypes,
+  host_span<int const> output_column_schemas,
+  std::reference_wrapper<ast::expression const> filter,
+  rmm::cuda_stream_view stream) const
+{
+  // Number of input table columns
+  auto const num_input_columns = static_cast<cudf::size_type>(output_dtypes.size());
+
+  // Total number of row groups after StatsAST filtration
+  auto const total_row_groups = std::accumulate(
+    input_row_group_indices.begin(),
+    input_row_group_indices.end(),
+    size_t{0},
+    [](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); });
+
+  // Check if we have less than 2B total row groups.
+  CUDF_EXPECTS(total_row_groups <= std::numeric_limits<cudf::size_type>::max(),
+               "Total number of row groups exceed the size_type's limit");
+
+  // Collect equality literals for each input table column
+  auto const equality_literals =
+    equality_literals_collector{filter.get(), num_input_columns}.get_equality_literals();
+
+  // Collect schema indices of columns with equality predicate(s)
+  std::vector<cudf::size_type> equality_col_schemas;
+  thrust::copy_if(thrust::host,
+                  output_column_schemas.begin(),
+                  output_column_schemas.end(),
+                  equality_literals.begin(),
+                  std::back_inserter(equality_col_schemas),
+                  [](auto& eq_literals) { return not eq_literals.empty(); });
+
+  // Return early if no column with equality predicate(s)
+  if (equality_col_schemas.empty()) { return std::nullopt; }
+
+  // Read a vector of bloom filter bitset device buffers for all columns with equality
+  // predicate(s) across all row groups
+  auto bloom_filter_data = read_bloom_filters(
+    sources, input_row_group_indices, equality_col_schemas, total_row_groups, stream);
+
+  // No bloom filter buffers, return the original row group indices
+  if (bloom_filter_data.empty()) { return std::nullopt; }
+
+  // Get parquet types for the predicate columns
+  auto const parquet_types = get_parquet_types(input_row_group_indices, equality_col_schemas);
+
+  // Create spans from bloom filter bitset buffers to use in cuco::bloom_filter_ref.
+  std::vector<cudf::device_span<cuda::std::byte>> h_bloom_filter_spans;
+  h_bloom_filter_spans.reserve(bloom_filter_data.size());
+  std::transform(bloom_filter_data.begin(),
+                 bloom_filter_data.end(),
+                 std::back_inserter(h_bloom_filter_spans),
+                 [&](auto& buffer) {
+                   return cudf::device_span<cuda::std::byte>{
+                     static_cast<cuda::std::byte*>(buffer.data()), buffer.size()};
+                 });
+
+  // Copy bloom filter bitset spans to device
+  auto const bloom_filter_spans = cudf::detail::make_device_uvector_async(
+    h_bloom_filter_spans, stream, cudf::get_current_device_resource_ref());
+
+  // Create a bloom filter query table caster
+  bloom_filter_caster const bloom_filter_col{
+    bloom_filter_spans, parquet_types, total_row_groups, equality_col_schemas.size()};
+
+  // Converts bloom filter membership for equality predicate columns to a table
+  // containing a column for each `col[i] == literal` predicate to be evaluated.
+  // The table contains #sources * #column_chunks_per_src rows.
+  std::vector<std::unique_ptr<cudf::column>> bloom_filter_membership_columns;
+  size_t equality_col_idx = 0;
+  std::for_each(
+    thrust::counting_iterator<size_t>(0),
+    thrust::counting_iterator(output_dtypes.size()),
+    [&](auto input_col_idx) {
+      auto const& dtype = output_dtypes[input_col_idx];
+
+      // Skip if no equality literals for this column
+      if (equality_literals[input_col_idx].empty()) { return; }
+
+      // Skip if non-comparable (compound) type except string
+      if (cudf::is_compound(dtype) and dtype.id() != cudf::type_id::STRING) { return; }
+
+      // Add a column for all literals associated with an equality column
+      for (auto const& literal : equality_literals[input_col_idx]) {
+        bloom_filter_membership_columns.emplace_back(cudf::type_dispatcher<dispatch_storage_type>(
+          dtype, bloom_filter_col, equality_col_idx, dtype, literal, stream));
+      }
+      equality_col_idx++;
+    });
+
+  // Create a table from columns
+  auto bloom_filter_membership_table = cudf::table(std::move(bloom_filter_membership_columns));
+
+  // Convert AST to BloomfilterAST expression with reference to bloom filter membership
+  // in above `bloom_filter_membership_table`
+  bloom_filter_expression_converter bloom_filter_expr{
+    filter.get(), num_input_columns, {equality_literals}};
+
+  // Filter bloom filter membership table with the BloomfilterAST expression and collect
+  // filtered row group indices
+  return collect_filtered_row_group_indices(bloom_filter_membership_table,
+                                            bloom_filter_expr.get_bloom_filter_expr(),
+                                            input_row_group_indices,
+                                            stream);
+}
+
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index f1ecf66c29f..b8e72aaac88 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -658,6 +658,33 @@ void CompactProtocolReader::read(ColumnChunk* c)
   function_builder(this, op);
 }
 
+void CompactProtocolReader::read(BloomFilterAlgorithm* alg)
+{
+  auto op = std::make_tuple(parquet_field_union_enumerator(1, alg->algorithm));
+  function_builder(this, op);
+}
+
+void CompactProtocolReader::read(BloomFilterHash* hash)
+{
+  auto op = std::make_tuple(parquet_field_union_enumerator(1, hash->hash));
+  function_builder(this, op);
+}
+
+void CompactProtocolReader::read(BloomFilterCompression* comp)
+{
+  auto op = std::make_tuple(parquet_field_union_enumerator(1, comp->compression));
+  function_builder(this, op);
+}
+
+void CompactProtocolReader::read(BloomFilterHeader* bf)
+{
+  auto op = std::make_tuple(parquet_field_int32(1, bf->num_bytes),
+                            parquet_field_struct(2, bf->algorithm),
+                            parquet_field_struct(3, bf->hash),
+                            parquet_field_struct(4, bf->compression));
+  function_builder(this, op);
+}
+
 void CompactProtocolReader::read(ColumnChunkMetaData* c)
 {
   using optional_size_statistics =
@@ -665,7 +692,9 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c)
   using optional_list_enc_stats =
     parquet_field_optional<std::vector<PageEncodingStats>,
                            parquet_field_struct_list<PageEncodingStats>>;
-  auto op = std::make_tuple(parquet_field_enum<Type>(1, c->type),
+  using optional_i64 = parquet_field_optional<int64_t, parquet_field_int64>;
+  using optional_i32 = parquet_field_optional<int32_t, parquet_field_int32>;
+  auto op            = std::make_tuple(parquet_field_enum<Type>(1, c->type),
                             parquet_field_enum_list(2, c->encodings),
                             parquet_field_string_list(3, c->path_in_schema),
                             parquet_field_enum<Compression>(4, c->codec),
@@ -677,6 +706,8 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c)
                             parquet_field_int64(11, c->dictionary_page_offset),
                             parquet_field_struct(12, c->statistics),
                             optional_list_enc_stats(13, c->encoding_stats),
+                            optional_i64(14, c->bloom_filter_offset),
+                            optional_i32(15, c->bloom_filter_length),
                             optional_size_statistics(16, c->size_statistics));
   function_builder(this, op);
 }
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index b87f2e9c692..360197b19ad 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -108,6 +108,10 @@ class CompactProtocolReader {
   void read(IntType* t);
   void read(RowGroup* r);
   void read(ColumnChunk* c);
+  void read(BloomFilterAlgorithm* bf);
+  void read(BloomFilterHash* bf);
+  void read(BloomFilterCompression* bf);
+  void read(BloomFilterHeader* bf);
   void read(ColumnChunkMetaData* c);
   void read(PageHeader* p);
   void read(DataPageHeader* d);
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 2851ef67a65..dc0c4b1540e 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -382,12 +382,62 @@ struct ColumnChunkMetaData {
   // Set of all encodings used for pages in this column chunk. This information can be used to
   // determine if all data pages are dictionary encoded for example.
   std::optional<std::vector<PageEncodingStats>> encoding_stats;
+  // Byte offset from beginning of file to Bloom filter data.
+  std::optional<int64_t> bloom_filter_offset;
+  // Size of Bloom filter data including the serialized header, in bytes. Added in 2.10 so readers
+  // may not read this field from old files and it can be obtained after the BloomFilterHeader has
+  // been deserialized. Writers should write this field so readers can read the bloom filter in a
+  // single I/O.
+  std::optional<int32_t> bloom_filter_length;
   // Optional statistics to help estimate total memory when converted to in-memory representations.
   // The histograms contained in these statistics can also be useful in some cases for more
   // fine-grained nullability/list length filter pushdown.
   std::optional<SizeStatistics> size_statistics;
 };
 
+/**
+ * @brief The algorithm used in bloom filter
+ */
+struct BloomFilterAlgorithm {
+  // Block-based Bloom filter.
+  enum class Algorithm { UNDEFINED, SPLIT_BLOCK };
+  Algorithm algorithm{Algorithm::SPLIT_BLOCK};
+};
+
+/**
+ * @brief The hash function used in Bloom filter
+ */
+struct BloomFilterHash {
+  // xxHash_64
+  enum class Hash { UNDEFINED, XXHASH };
+  Hash hash{Hash::XXHASH};
+};
+
+/**
+ * @brief The compression used in the bloom filter
+ */
+struct BloomFilterCompression {
+  enum class Compression { UNDEFINED, UNCOMPRESSED };
+  Compression compression{Compression::UNCOMPRESSED};
+};
+
+/**
+ * @brief Bloom filter header struct
+ *
+ * The bloom filter data of a column chunk stores this header at the beginning
+ * following by the filter bitset.
+ */
+struct BloomFilterHeader {
+  // The size of bitset in bytes
+  int32_t num_bytes;
+  // The algorithm for setting bits
+  BloomFilterAlgorithm algorithm;
+  // The hash function used for bloom filter
+  BloomFilterHash hash;
+  // The compression used in the bloom filter
+  BloomFilterCompression compression;
+};
+
 /**
  * @brief Thrift-derived struct describing a chunk of data for a particular
  * column
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index 9047ff9169b..0e307bac097 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@
 #include <thrust/iterator/counting_iterator.h>
 
 #include <algorithm>
+#include <limits>
 #include <numeric>
 #include <optional>
 #include <unordered_set>
@@ -388,6 +389,7 @@ class stats_expression_converter : public ast::detail::expression_transformer {
 }  // namespace
 
 std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::filter_row_groups(
+  host_span<std::unique_ptr<datasource> const> sources,
   host_span<std::vector<size_type> const> row_group_indices,
   host_span<data_type const> output_dtypes,
   host_span<int const> output_column_schemas,
@@ -396,7 +398,6 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
 {
   auto mr = cudf::get_current_device_resource_ref();
   // Create row group indices.
-  std::vector<std::vector<size_type>> filtered_row_group_indices;
   std::vector<std::vector<size_type>> all_row_group_indices;
   host_span<std::vector<size_type> const> input_row_group_indices;
   if (row_group_indices.empty()) {
@@ -412,18 +413,22 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
   } else {
     input_row_group_indices = row_group_indices;
   }
-  auto const total_row_groups = std::accumulate(input_row_group_indices.begin(),
-                                                input_row_group_indices.end(),
-                                                0,
-                                                [](size_type sum, auto const& per_file_row_groups) {
-                                                  return sum + per_file_row_groups.size();
-                                                });
+  auto const total_row_groups = std::accumulate(
+    input_row_group_indices.begin(),
+    input_row_group_indices.end(),
+    size_t{0},
+    [](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); });
+
+  // Check if we have less than 2B total row groups.
+  CUDF_EXPECTS(total_row_groups <= std::numeric_limits<cudf::size_type>::max(),
+               "Total number of row groups exceed the size_type's limit");
 
   // Converts Column chunk statistics to a table
   // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1]
   // For each column, it contains #sources * #column_chunks_per_src rows.
   std::vector<std::unique_ptr<column>> columns;
-  stats_caster const stats_col{total_row_groups, per_file_metadata, input_row_group_indices};
+  stats_caster const stats_col{
+    static_cast<size_type>(total_row_groups), per_file_metadata, input_row_group_indices};
   for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) {
     auto const schema_idx = output_column_schemas[col_idx];
     auto const& dtype     = output_dtypes[col_idx];
@@ -452,44 +457,23 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
   CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8,
                "Filter expression must return a boolean column");
 
-  auto const host_bitmask = [&] {
-    auto const num_bitmasks = num_bitmask_words(predicate.size());
-    if (predicate.nullable()) {
-      return cudf::detail::make_host_vector_sync(
-        device_span<bitmask_type const>(predicate.null_mask(), num_bitmasks), stream);
-    } else {
-      auto bitmask = cudf::detail::make_host_vector<bitmask_type>(num_bitmasks, stream);
-      std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0});
-      return bitmask;
-    }
-  }();
+  // Filter stats table with StatsAST expression and collect filtered row group indices
+  auto const filtered_row_group_indices = collect_filtered_row_group_indices(
+    stats_table, stats_expr.get_stats_expr(), input_row_group_indices, stream);
 
-  auto validity_it = cudf::detail::make_counting_transform_iterator(
-    0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); });
+  // Span of row groups to apply bloom filtering on.
+  auto const bloom_filter_input_row_groups =
+    filtered_row_group_indices.has_value()
+      ? host_span<std::vector<size_type> const>(filtered_row_group_indices.value())
+      : input_row_group_indices;
 
-  auto const is_row_group_required = cudf::detail::make_host_vector_sync(
-    device_span<uint8_t const>(predicate.data<uint8_t>(), predicate.size()), stream);
+  // Apply bloom filtering on the bloom filter input row groups
+  auto const bloom_filtered_row_groups = apply_bloom_filters(
+    sources, bloom_filter_input_row_groups, output_dtypes, output_column_schemas, filter, stream);
 
-  // Return only filtered row groups based on predicate
-  // if all are required or all are nulls, return.
-  if (std::all_of(is_row_group_required.cbegin(),
-                  is_row_group_required.cend(),
-                  [](auto i) { return bool(i); }) or
-      predicate.null_count() == predicate.size()) {
-    return std::nullopt;
-  }
-  size_type is_required_idx = 0;
-  for (auto const& input_row_group_index : input_row_group_indices) {
-    std::vector<size_type> filtered_row_groups;
-    for (auto const rg_idx : input_row_group_index) {
-      if ((!validity_it[is_required_idx]) || is_row_group_required[is_required_idx]) {
-        filtered_row_groups.push_back(rg_idx);
-      }
-      ++is_required_idx;
-    }
-    filtered_row_group_indices.push_back(std::move(filtered_row_groups));
-  }
-  return {std::move(filtered_row_group_indices)};
+  // Return bloom filtered row group indices iff collected
+  return bloom_filtered_row_groups.has_value() ? bloom_filtered_row_groups
+                                               : filtered_row_group_indices;
 }
 
 // convert column named expression to column index reference expression
@@ -510,14 +494,14 @@ named_to_reference_converter::named_to_reference_converter(
 std::reference_wrapper<ast::expression const> named_to_reference_converter::visit(
   ast::literal const& expr)
 {
-  _stats_expr = std::reference_wrapper<ast::expression const>(expr);
+  _converted_expr = std::reference_wrapper<ast::expression const>(expr);
   return expr;
 }
 
 std::reference_wrapper<ast::expression const> named_to_reference_converter::visit(
   ast::column_reference const& expr)
 {
-  _stats_expr = std::reference_wrapper<ast::expression const>(expr);
+  _converted_expr = std::reference_wrapper<ast::expression const>(expr);
   return expr;
 }
 
@@ -531,7 +515,7 @@ std::reference_wrapper<ast::expression const> named_to_reference_converter::visi
   }
   auto col_index = col_index_it->second;
   _col_ref.emplace_back(col_index);
-  _stats_expr = std::reference_wrapper<ast::expression const>(_col_ref.back());
+  _converted_expr = std::reference_wrapper<ast::expression const>(_col_ref.back());
   return std::reference_wrapper<ast::expression const>(_col_ref.back());
 }
 
@@ -546,7 +530,7 @@ std::reference_wrapper<ast::expression const> named_to_reference_converter::visi
   } else if (cudf::ast::detail::ast_operator_arity(op) == 1) {
     _operators.emplace_back(op, new_operands.front());
   }
-  _stats_expr = std::reference_wrapper<ast::expression const>(_operators.back());
+  _converted_expr = std::reference_wrapper<ast::expression const>(_operators.back());
   return std::reference_wrapper<ast::expression const>(_operators.back());
 }
 
@@ -640,4 +624,60 @@ class names_from_expression : public ast::detail::expression_transformer {
   return names_from_expression(expr, skip_names).to_vector();
 }
 
+std::optional<std::vector<std::vector<size_type>>> collect_filtered_row_group_indices(
+  cudf::table_view table,
+  std::reference_wrapper<ast::expression const> ast_expr,
+  host_span<std::vector<size_type> const> input_row_group_indices,
+  rmm::cuda_stream_view stream)
+{
+  // Filter the input table using AST expression
+  auto predicate_col = cudf::detail::compute_column(
+    table, ast_expr.get(), stream, cudf::get_current_device_resource_ref());
+  auto predicate = predicate_col->view();
+  CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8,
+               "Filter expression must return a boolean column");
+
+  auto const host_bitmask = [&] {
+    auto const num_bitmasks = num_bitmask_words(predicate.size());
+    if (predicate.nullable()) {
+      return cudf::detail::make_host_vector_sync(
+        device_span<bitmask_type const>(predicate.null_mask(), num_bitmasks), stream);
+    } else {
+      auto bitmask = cudf::detail::make_host_vector<bitmask_type>(num_bitmasks, stream);
+      std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0});
+      return bitmask;
+    }
+  }();
+
+  auto validity_it = cudf::detail::make_counting_transform_iterator(
+    0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); });
+
+  // Return only filtered row groups based on predicate
+  auto const is_row_group_required = cudf::detail::make_host_vector_sync(
+    device_span<uint8_t const>(predicate.data<uint8_t>(), predicate.size()), stream);
+
+  // Return if all are required, or all are nulls.
+  if (predicate.null_count() == predicate.size() or std::all_of(is_row_group_required.cbegin(),
+                                                                is_row_group_required.cend(),
+                                                                [](auto i) { return bool(i); })) {
+    return std::nullopt;
+  }
+
+  // Collect indices of the filtered row groups
+  size_type is_required_idx = 0;
+  std::vector<std::vector<size_type>> filtered_row_group_indices;
+  for (auto const& input_row_group_index : input_row_group_indices) {
+    std::vector<size_type> filtered_row_groups;
+    for (auto const rg_idx : input_row_group_index) {
+      if ((!validity_it[is_required_idx]) || is_row_group_required[is_required_idx]) {
+        filtered_row_groups.push_back(rg_idx);
+      }
+      ++is_required_idx;
+    }
+    filtered_row_group_indices.push_back(std::move(filtered_row_groups));
+  }
+
+  return {filtered_row_group_indices};
+}
+
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 0dd1aff41e9..25baa1e0ec8 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1030,6 +1030,7 @@ std::vector<std::string> aggregate_reader_metadata::get_pandas_index_names() con
 
 std::tuple<int64_t, size_type, std::vector<row_group_info>, std::vector<size_t>>
 aggregate_reader_metadata::select_row_groups(
+  host_span<std::unique_ptr<datasource> const> sources,
   host_span<std::vector<size_type> const> row_group_indices,
   int64_t skip_rows_opt,
   std::optional<size_type> const& num_rows_opt,
@@ -1042,7 +1043,7 @@ aggregate_reader_metadata::select_row_groups(
   // if filter is not empty, then gather row groups to read after predicate pushdown
   if (filter.has_value()) {
     filtered_row_group_indices = filter_row_groups(
-      row_group_indices, output_dtypes, output_column_schemas, filter.value(), stream);
+      sources, row_group_indices, output_dtypes, output_column_schemas, filter.value(), stream);
     if (filtered_row_group_indices.has_value()) {
       row_group_indices =
         host_span<std::vector<size_type> const>(filtered_row_group_indices.value());
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index fd692c0cdd6..a28ce616e2c 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -195,6 +195,38 @@ class aggregate_reader_metadata {
    */
   void column_info_for_row_group(row_group_info& rg_info, size_type chunk_start_row) const;
 
+  /**
+   * @brief Reads bloom filter bitsets for the specified columns from the given lists of row
+   * groups.
+   *
+   * @param sources Dataset sources
+   * @param row_group_indices Lists of row groups to read bloom filters from, one per source
+   * @param[out] bloom_filter_data List of bloom filter data device buffers
+   * @param column_schemas Schema indices of columns whose bloom filters will be read
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   *
+   * @return A flattened list of bloom filter bitset device buffers for each predicate column across
+   * row group
+   */
+  [[nodiscard]] std::vector<rmm::device_buffer> read_bloom_filters(
+    host_span<std::unique_ptr<datasource> const> sources,
+    host_span<std::vector<size_type> const> row_group_indices,
+    host_span<int const> column_schemas,
+    size_type num_row_groups,
+    rmm::cuda_stream_view stream) const;
+
+  /**
+   * @brief Collects Parquet types for the columns with the specified schema indices
+   *
+   * @param row_group_indices Lists of row groups, once per source
+   * @param column_schemas Schema indices of columns whose types will be collected
+   *
+   * @return A list of parquet types for the columns matching the provided schema indices
+   */
+  [[nodiscard]] std::vector<Type> get_parquet_types(
+    host_span<std::vector<size_type> const> row_group_indices,
+    host_span<int const> column_schemas) const;
+
  public:
   aggregate_reader_metadata(host_span<std::unique_ptr<datasource> const> sources,
                             bool use_arrow_schema,
@@ -323,26 +355,49 @@ class aggregate_reader_metadata {
   /**
    * @brief Filters the row groups based on predicate filter
    *
+   * @param sources Lists of input datasources
    * @param row_group_indices Lists of row groups to read, one per source
-   * @param output_dtypes Datatypes of of output columns
+   * @param output_dtypes Datatypes of output columns
    * @param output_column_schemas schema indices of output columns
    * @param filter AST expression to filter row groups based on Column chunk statistics
    * @param stream CUDA stream used for device memory operations and kernel launches
-   * @return Filtered row group indices, if any is filtered.
+   * @return Filtered row group indices, if any is filtered
    */
   [[nodiscard]] std::optional<std::vector<std::vector<size_type>>> filter_row_groups(
+    host_span<std::unique_ptr<datasource> const> sources,
     host_span<std::vector<size_type> const> row_group_indices,
     host_span<data_type const> output_dtypes,
     host_span<int const> output_column_schemas,
     std::reference_wrapper<ast::expression const> filter,
     rmm::cuda_stream_view stream) const;
 
+  /**
+   * @brief Filters the row groups using bloom filters
+   *
+   * @param sources Dataset sources
+   * @param row_group_indices Lists of input row groups to read, one per source
+   * @param output_dtypes Datatypes of output columns
+   * @param output_column_schemas schema indices of output columns
+   * @param filter AST expression to filter row groups based on bloom filter membership
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   *
+   * @return Filtered row group indices, if any is filtered
+   */
+  [[nodiscard]] std::optional<std::vector<std::vector<size_type>>> apply_bloom_filters(
+    host_span<std::unique_ptr<datasource> const> sources,
+    host_span<std::vector<size_type> const> input_row_group_indices,
+    host_span<data_type const> output_dtypes,
+    host_span<int const> output_column_schemas,
+    std::reference_wrapper<ast::expression const> filter,
+    rmm::cuda_stream_view stream) const;
+
   /**
    * @brief Filters and reduces down to a selection of row groups
    *
    * The input `row_start` and `row_count` parameters will be recomputed and output as the valid
    * values based on the input row group list.
    *
+   * @param sources Lists of input datasources
    * @param row_group_indices Lists of row groups to read, one per source
    * @param row_start Starting row of the selection
    * @param row_count Total number of rows selected
@@ -351,10 +406,11 @@ class aggregate_reader_metadata {
    * @param filter Optional AST expression to filter row groups based on Column chunk statistics
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @return A tuple of corrected row_start, row_count, list of row group indexes and its
-   *         starting row, and list of number of rows per source.
+   *         starting row, and list of number of rows per source
    */
   [[nodiscard]] std::tuple<int64_t, size_type, std::vector<row_group_info>, std::vector<size_t>>
-  select_row_groups(host_span<std::vector<size_type> const> row_group_indices,
+  select_row_groups(host_span<std::unique_ptr<datasource> const> sources,
+                    host_span<std::vector<size_type> const> row_group_indices,
                     int64_t row_start,
                     std::optional<size_type> const& row_count,
                     host_span<data_type const> output_dtypes,
@@ -413,14 +469,14 @@ class named_to_reference_converter : public ast::detail::expression_transformer
   std::reference_wrapper<ast::expression const> visit(ast::operation const& expr) override;
 
   /**
-   * @brief Returns the AST to apply on Column chunk statistics.
+   * @brief Returns the converted AST expression
    *
    * @return AST operation expression
    */
   [[nodiscard]] std::optional<std::reference_wrapper<ast::expression const>> get_converted_expr()
     const
   {
-    return _stats_expr;
+    return _converted_expr;
   }
 
  private:
@@ -428,7 +484,7 @@ class named_to_reference_converter : public ast::detail::expression_transformer
     cudf::host_span<std::reference_wrapper<ast::expression const> const> operands);
 
   std::unordered_map<std::string, size_type> column_name_to_index;
-  std::optional<std::reference_wrapper<ast::expression const>> _stats_expr;
+  std::optional<std::reference_wrapper<ast::expression const>> _converted_expr;
   // Using std::list or std::deque to avoid reference invalidation
   std::list<ast::column_reference> _col_ref;
   std::list<ast::operation> _operators;
@@ -445,4 +501,22 @@ class named_to_reference_converter : public ast::detail::expression_transformer
   std::optional<std::reference_wrapper<ast::expression const>> expr,
   std::vector<std::string> const& skip_names);
 
+/**
+ * @brief Filter table using the provided (StatsAST or BloomfilterAST) expression and
+ * collect filtered row group indices
+ *
+ * @param table Table of stats or bloom filter membership columns
+ * @param ast_expr StatsAST or BloomfilterAST expression to filter with
+ * @param input_row_group_indices Lists of input row groups to read, one per source
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return Collected filtered row group indices, one vector per source, if any. A std::nullopt if
+ * all row groups are required or if the computed predicate is all nulls
+ */
+[[nodiscard]] std::optional<std::vector<std::vector<size_type>>> collect_filtered_row_group_indices(
+  cudf::table_view ast_table,
+  std::reference_wrapper<ast::expression const> ast_expr,
+  host_span<std::vector<size_type> const> input_row_group_indices,
+  rmm::cuda_stream_view stream);
+
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 326232ced60..43666f9e42d 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1286,7 +1286,8 @@ void reader::impl::preprocess_file(read_mode mode)
            _file_itm_data.global_num_rows,
            _file_itm_data.row_groups,
            _file_itm_data.num_rows_per_source) =
-    _metadata->select_row_groups(_options.row_group_indices,
+    _metadata->select_row_groups(_sources,
+                                 _options.row_group_indices,
                                  _options.skip_rows,
                                  _options.num_rows,
                                  output_dtypes,
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 77924ac0f35..1b67b53ae8e 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -23,8 +23,7 @@
 #include "compact_protocol_reader.hpp"
 #include "compact_protocol_writer.hpp"
 #include "interop/decimal_conversion_utilities.cuh"
-#include "io/comp/gpuinflate.hpp"
-#include "io/comp/nvcomp_adapter.hpp"
+#include "io/comp/comp.hpp"
 #include "io/parquet/parquet.hpp"
 #include "io/parquet/parquet_gpu.hpp"
 #include "io/statistics/column_statistics.cuh"
@@ -67,6 +66,20 @@ namespace cudf::io::parquet::detail {
 
 using namespace cudf::io::detail;
 
+Compression to_parquet_compression(compression_type compression)
+{
+  switch (compression) {
+    case compression_type::AUTO:
+    case compression_type::SNAPPY: return Compression::SNAPPY;
+    case compression_type::ZSTD: return Compression::ZSTD;
+    case compression_type::LZ4:
+      // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4
+      return Compression::LZ4_RAW;
+    case compression_type::NONE: return Compression::UNCOMPRESSED;
+    default: CUDF_FAIL("Unsupported compression type");
+  }
+}
+
 struct aggregate_writer_metadata {
   aggregate_writer_metadata(host_span<partition_info const> partitions,
                             host_span<std::map<std::string, std::string> const> kv_md,
@@ -1172,7 +1185,7 @@ auto init_page_sizes(hostdevice_2dvector<EncColumnChunk>& chunks,
                      size_t max_page_size_bytes,
                      size_type max_page_size_rows,
                      bool write_v2_headers,
-                     Compression compression_codec,
+                     compression_type compression,
                      rmm::cuda_stream_view stream)
 {
   if (chunks.is_empty()) { return cudf::detail::hostdevice_vector<size_type>{}; }
@@ -1187,7 +1200,7 @@ auto init_page_sizes(hostdevice_2dvector<EncColumnChunk>& chunks,
                    num_columns,
                    max_page_size_bytes,
                    max_page_size_rows,
-                   page_alignment(compression_codec),
+                   compress_required_chunk_alignment(compression),
                    write_v2_headers,
                    nullptr,
                    nullptr,
@@ -1212,7 +1225,7 @@ auto init_page_sizes(hostdevice_2dvector<EncColumnChunk>& chunks,
                    num_columns,
                    max_page_size_bytes,
                    max_page_size_rows,
-                   page_alignment(compression_codec),
+                   compress_required_chunk_alignment(compression),
                    write_v2_headers,
                    nullptr,
                    nullptr,
@@ -1221,12 +1234,10 @@ auto init_page_sizes(hostdevice_2dvector<EncColumnChunk>& chunks,
 
   // Get per-page max compressed size
   cudf::detail::hostdevice_vector<size_type> comp_page_sizes(num_pages, stream);
-  std::transform(page_sizes.begin(),
-                 page_sizes.end(),
-                 comp_page_sizes.begin(),
-                 [compression_codec](auto page_size) {
-                   return max_compression_output_size(compression_codec, page_size);
-                 });
+  std::transform(
+    page_sizes.begin(), page_sizes.end(), comp_page_sizes.begin(), [compression](auto page_size) {
+      return max_compressed_size(compression, page_size);
+    });
   comp_page_sizes.host_to_device_async(stream);
 
   // Use per-page max compressed size to calculate chunk.compressed_size
@@ -1238,7 +1249,7 @@ auto init_page_sizes(hostdevice_2dvector<EncColumnChunk>& chunks,
                    num_columns,
                    max_page_size_bytes,
                    max_page_size_rows,
-                   page_alignment(compression_codec),
+                   compress_required_chunk_alignment(compression),
                    write_v2_headers,
                    nullptr,
                    nullptr,
@@ -1247,16 +1258,13 @@ auto init_page_sizes(hostdevice_2dvector<EncColumnChunk>& chunks,
   return comp_page_sizes;
 }
 
-size_t max_page_bytes(Compression compression, size_t max_page_size_bytes)
+size_t max_page_bytes(compression_type compression, size_t max_page_size_bytes)
 {
-  if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; }
+  if (compression == compression_type::NONE) { return max_page_size_bytes; }
 
-  auto const ncomp_type   = to_nvcomp_compression_type(compression);
-  auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type)
-                              ? std::nullopt
-                              : nvcomp::compress_max_allowed_chunk_size(ncomp_type);
+  auto const comp_limit = compress_max_allowed_chunk_size(compression);
 
-  auto max_size = std::min(nvcomp_limit.value_or(max_page_size_bytes), max_page_size_bytes);
+  auto max_size = std::min(comp_limit.value_or(max_page_size_bytes), max_page_size_bytes);
   // page size must fit in a 32-bit signed integer
   return std::min<size_t>(max_size, std::numeric_limits<int32_t>::max());
 }
@@ -1265,7 +1273,7 @@ std::pair<std::vector<rmm::device_uvector<size_type>>, std::vector<rmm::device_u
 build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
                          host_span<parquet_column_device_view const> col_desc,
                          device_2dspan<PageFragment const> frags,
-                         Compression compression,
+                         compression_type compression,
                          dictionary_policy dict_policy,
                          size_t max_dict_size,
                          rmm::cuda_stream_view stream)
@@ -1404,7 +1412,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
  * @param num_columns Total number of columns
  * @param num_pages Total number of pages
  * @param num_stats_bfr Number of statistics buffers
- * @param compression Compression format
+ * @param alignment Page alignment
  * @param max_page_size_bytes Maximum uncompressed page size, in bytes
  * @param max_page_size_rows Maximum page size, in rows
  * @param write_v2_headers True if version 2 page headers are to be written
@@ -1419,7 +1427,7 @@ void init_encoder_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
                         uint32_t num_columns,
                         uint32_t num_pages,
                         uint32_t num_stats_bfr,
-                        Compression compression,
+                        size_t alignment,
                         size_t max_page_size_bytes,
                         size_type max_page_size_rows,
                         bool write_v2_headers,
@@ -1435,7 +1443,7 @@ void init_encoder_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
                    num_columns,
                    max_page_size_bytes,
                    max_page_size_rows,
-                   page_alignment(compression),
+                   alignment,
                    write_v2_headers,
                    (num_stats_bfr) ? page_stats_mrg.data() : nullptr,
                    (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr,
@@ -1478,7 +1486,7 @@ void encode_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
                   statistics_chunk const* chunk_stats,
                   statistics_chunk const* column_stats,
                   std::optional<writer_compression_statistics>& comp_stats,
-                  Compression compression,
+                  compression_type compression,
                   int32_t column_index_truncate_length,
                   bool write_v2_headers,
                   rmm::cuda_stream_view stream)
@@ -1488,7 +1496,7 @@ void encode_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
                            ? device_span<statistics_chunk const>(page_stats, num_pages)
                            : device_span<statistics_chunk const>();
 
-  uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? num_pages : 0;
+  uint32_t max_comp_pages = (compression != compression_type::NONE) ? num_pages : 0;
 
   rmm::device_uvector<device_span<uint8_t const>> comp_in(max_comp_pages, stream);
   rmm::device_uvector<device_span<uint8_t>> comp_out(max_comp_pages, stream);
@@ -1499,34 +1507,7 @@ void encode_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
                compression_result{0, compression_status::FAILURE});
 
   EncodePages(pages, write_v2_headers, comp_in, comp_out, comp_res, stream);
-  switch (compression) {
-    case Compression::SNAPPY:
-      if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) {
-        gpu_snap(comp_in, comp_out, comp_res, stream);
-      } else {
-        nvcomp::batched_compress(
-          nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream);
-      }
-      break;
-    case Compression::ZSTD: {
-      if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD);
-          reason) {
-        CUDF_FAIL("Compression error: " + reason.value());
-      }
-      nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream);
-      break;
-    }
-    case Compression::LZ4_RAW: {
-      if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::LZ4);
-          reason) {
-        CUDF_FAIL("Compression error: " + reason.value());
-      }
-      nvcomp::batched_compress(nvcomp::compression_type::LZ4, comp_in, comp_out, comp_res, stream);
-      break;
-    }
-    case Compression::UNCOMPRESSED: break;
-    default: CUDF_FAIL("invalid compression type");
-  }
+  compress(compression, comp_in, comp_out, comp_res, stream);
 
   // TBD: Not clear if the official spec actually allows dynamically turning off compression at the
   // chunk-level
@@ -1744,7 +1725,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
                                    size_type max_page_size_rows,
                                    int32_t column_index_truncate_length,
                                    statistics_freq stats_granularity,
-                                   Compression compression,
+                                   compression_type compression,
                                    bool collect_compression_statistics,
                                    dictionary_policy dict_policy,
                                    size_t max_dictionary_size,
@@ -2146,7 +2127,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   }
 
   // Clear compressed buffer size if compression has been turned off
-  if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; }
+  if (compression == compression_type::NONE) { max_comp_bfr_size = 0; }
 
   // Initialize data pointers
   uint32_t const num_stats_bfr =
@@ -2214,7 +2195,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
                        num_columns,
                        num_pages,
                        num_stats_bfr,
-                       compression,
+                       compress_required_chunk_alignment(compression),
                        max_page_size_bytes,
                        max_page_size_rows,
                        write_v2_headers,
@@ -2270,7 +2251,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
         auto const dev_bfr      = ck.is_compressed ? ck.compressed_bfr : ck.uncompressed_bfr;
         auto& column_chunk_meta = row_group.columns[i].meta_data;
 
-        if (ck.is_compressed) { column_chunk_meta.codec = compression; }
+        if (ck.is_compressed) { column_chunk_meta.codec = to_parquet_compression(compression); }
         if (!out_sink[p]->is_device_write_preferred(ck.compressed_size)) {
           all_device_write = false;
         }
@@ -2375,7 +2356,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
                    single_write_mode mode,
                    rmm::cuda_stream_view stream)
   : _stream(stream),
-    _compression(to_parquet_compression(options.get_compression())),
+    _compression(options.get_compression()),
     _max_row_group_size{options.get_row_group_size_bytes()},
     _max_row_group_rows{options.get_row_group_size_rows()},
     _max_page_size_bytes(max_page_bytes(_compression, options.get_max_page_size_bytes())),
@@ -2406,7 +2387,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
                    single_write_mode mode,
                    rmm::cuda_stream_view stream)
   : _stream(stream),
-    _compression(to_parquet_compression(options.get_compression())),
+    _compression(options.get_compression()),
     _max_row_group_size{options.get_row_group_size_bytes()},
     _max_row_group_rows{options.get_row_group_size_rows()},
     _max_page_size_bytes(max_page_bytes(_compression, options.get_max_page_size_bytes())),
diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp
index 63128faf993..d5a5a534b93 100644
--- a/cpp/src/io/parquet/writer_impl.hpp
+++ b/cpp/src/io/parquet/writer_impl.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -144,7 +144,7 @@ class writer::impl {
   rmm::cuda_stream_view _stream;
 
   // Writer options.
-  Compression const _compression;
+  compression_type const _compression;
   size_t const _max_row_group_size;
   size_type const _max_row_group_rows;
   size_t const _max_page_size_bytes;
diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp
index f15ea1f3c37..ede788c97c2 100644
--- a/cpp/src/io/parquet/writer_impl_helpers.cpp
+++ b/cpp/src/io/parquet/writer_impl_helpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,8 +21,6 @@
 
 #include "writer_impl_helpers.hpp"
 
-#include "io/comp/nvcomp_adapter.hpp"
-
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -32,48 +30,6 @@ namespace cudf::io::parquet::detail {
 
 using namespace cudf::io::detail;
 
-Compression to_parquet_compression(compression_type compression)
-{
-  switch (compression) {
-    case compression_type::AUTO:
-    case compression_type::SNAPPY: return Compression::SNAPPY;
-    case compression_type::ZSTD: return Compression::ZSTD;
-    case compression_type::LZ4:
-      // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4
-      return Compression::LZ4_RAW;
-    case compression_type::NONE: return Compression::UNCOMPRESSED;
-    default: CUDF_FAIL("Unsupported compression type");
-  }
-}
-
-nvcomp::compression_type to_nvcomp_compression_type(Compression codec)
-{
-  switch (codec) {
-    case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY;
-    case Compression::ZSTD: return nvcomp::compression_type::ZSTD;
-    // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4
-    case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4;
-    default: CUDF_FAIL("Unsupported compression type");
-  }
-}
-
-uint32_t page_alignment(Compression codec)
-{
-  if (codec == Compression::UNCOMPRESSED or
-      nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) {
-    return 1u;
-  }
-
-  return nvcomp::required_alignment(to_nvcomp_compression_type(codec));
-}
-
-size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize)
-{
-  if (codec == Compression::UNCOMPRESSED) return 0;
-
-  return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize);
-}
-
 void fill_table_meta(table_input_metadata& table_meta)
 {
   // Fill unnamed columns' names in table_meta
diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp
index 14a9a0ed5b7..b5c73c348fe 100644
--- a/cpp/src/io/parquet/writer_impl_helpers.hpp
+++ b/cpp/src/io/parquet/writer_impl_helpers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,48 +20,12 @@
  */
 
 #pragma once
-#include "parquet_common.hpp"
 
 #include <cudf/detail/utilities/linked_column.hpp>
 #include <cudf/io/detail/parquet.hpp>
-#include <cudf/io/nvcomp_adapter.hpp>
 
 namespace cudf::io::parquet::detail {
 
-/**
- * @brief Function that translates GDF compression to parquet compression.
- *
- * @param compression The compression type
- * @return The supported Parquet compression
- */
-Compression to_parquet_compression(compression_type compression);
-
-/**
- * @brief Function that translates the given compression codec to nvcomp compression type.
- *
- * @param codec Compression codec
- * @return Translated nvcomp compression type
- */
-cudf::io::detail::nvcomp::compression_type to_nvcomp_compression_type(Compression codec);
-
-/**
- * @brief Function that computes input alignment requirements for the given compression type.
- *
- * @param codec Compression codec
- * @return Required alignment
- */
-uint32_t page_alignment(Compression codec);
-
-/**
- * @brief Gets the maximum compressed chunk size for the largest chunk uncompressed chunk in the
- *        batch.
- *
- * @param codec Compression codec
- * @param compression_blocksize Size of the largest uncompressed chunk in the batch
- * @return Maximum compressed chunk size
- */
-size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize);
-
 /**
  * @brief Fill the table metadata with default column names.
  *
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index cea0ebad8f5..726feca328b 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,8 +56,7 @@ void set_up_kvikio()
 {
   static std::once_flag flag{};
   std::call_once(flag, [] {
-    auto const compat_mode =
-      kvikio::detail::getenv_or("KVIKIO_COMPAT_MODE", kvikio::CompatMode::ON);
+    auto const compat_mode = kvikio::getenv_or("KVIKIO_COMPAT_MODE", kvikio::CompatMode::ON);
     kvikio::defaults::compat_mode_reset(compat_mode);
 
     auto const nthreads = getenv_or<unsigned int>("KVIKIO_NTHREADS", 4u);
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index 75ebc078930..928625a7e8f 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <cudf/aggregation/host_udf.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/copy.hpp>
@@ -144,6 +145,13 @@ struct reduce_dispatch_functor {
         auto td_agg = static_cast<cudf::detail::merge_tdigest_aggregation const&>(agg);
         return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr);
       }
+      case aggregation::HOST_UDF: {
+        auto const& udf_base_ptr =
+          dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
+        auto const udf_ptr = dynamic_cast<reduce_host_udf const*>(udf_base_ptr.get());
+        CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for reduction.");
+        return (*udf_ptr)(col, output_dtype, init, stream, mr);
+      }  // case aggregation::HOST_UDF
       default: CUDF_FAIL("Unsupported reduction operator");
     }
   }
@@ -161,9 +169,11 @@ std::unique_ptr<scalar> reduce(column_view const& col,
                cudf::data_type_error);
   if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT ||
                             agg.kind == aggregation::MIN || agg.kind == aggregation::MAX ||
-                            agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) {
+                            agg.kind == aggregation::ANY || agg.kind == aggregation::ALL ||
+                            agg.kind == aggregation::HOST_UDF)) {
     CUDF_FAIL(
-      "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types");
+      "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF "
+      "aggregation types");
   }
 
   // Returns default scalar if input column is empty or all null
diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
index 1c3a2b0c0f3..5835bfcf0a1 100644
--- a/cpp/src/reductions/segmented/reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,6 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+#include <cudf/aggregation/host_udf.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
@@ -98,6 +100,13 @@ struct segmented_reduce_dispatch_functor {
       }
       case segmented_reduce_aggregation::NUNIQUE:
         return segmented_nunique(col, offsets, null_handling, stream, mr);
+      case aggregation::HOST_UDF: {
+        auto const& udf_base_ptr =
+          dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
+        auto const udf_ptr = dynamic_cast<segmented_reduce_host_udf const*>(udf_base_ptr.get());
+        CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for segmented reduction.");
+        return (*udf_ptr)(col, offsets, output_dtype, null_handling, init, stream, mr);
+      }  // case aggregation::HOST_UDF
       default: CUDF_FAIL("Unsupported aggregation type.");
     }
   }
@@ -117,9 +126,11 @@ std::unique_ptr<column> segmented_reduce(column_view const& segmented_values,
                cudf::data_type_error);
   if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT ||
                             agg.kind == aggregation::MIN || agg.kind == aggregation::MAX ||
-                            agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) {
+                            agg.kind == aggregation::ANY || agg.kind == aggregation::ALL ||
+                            agg.kind == aggregation::HOST_UDF)) {
     CUDF_FAIL(
-      "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types");
+      "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF "
+      "aggregation types");
   }
 
   if (segmented_values.is_empty() && offsets.empty()) {
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 997b0278fe2..33d52ccd570 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
-#include <nvtext/detail/generate_ngrams.hpp>
+#include <nvtext/generate_ngrams.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -315,6 +315,7 @@ namespace {
  */
 CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings,
                                              cudf::size_type ngrams,
+                                             uint32_t seed,
                                              cudf::size_type const* d_ngram_offsets,
                                              cudf::hash_value_type* d_results)
 {
@@ -332,7 +333,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st
   __shared__ cudf::hash_value_type hvs[block_size];  // temp store for hash values
 
   auto const ngram_offset = d_ngram_offsets[str_idx];
-  auto const hasher       = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
+  auto const hasher       = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{seed};
 
   auto const end        = d_str.data() + d_str.size_bytes();
   auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1;
@@ -368,6 +369,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st
 
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& input,
                                                     cudf::size_type ngrams,
+                                                    uint32_t seed,
                                                     rmm::cuda_stream_view stream,
                                                     rmm::device_async_resource_ref mr)
 {
@@ -400,7 +402,7 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
   auto d_hashes = hashes->mutable_view().data<cudf::hash_value_type>();
 
   character_ngram_hash_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    *d_strings, ngrams, d_offsets, d_hashes);
+    *d_strings, ngrams, seed, d_offsets, d_hashes);
 
   return make_lists_column(
     input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr);
@@ -419,11 +421,12 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
 
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
                                                     cudf::size_type ngrams,
+                                                    uint32_t seed,
                                                     rmm::cuda_stream_view stream,
                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::hash_character_ngrams(strings, ngrams, stream, mr);
+  return detail::hash_character_ngrams(strings, ngrams, seed, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 344979e1288..6a89b1e48d6 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -220,11 +220,12 @@ ConfigureTest(
   REDUCTIONS_TEST
   reductions/collect_ops_tests.cpp
   reductions/ewm_tests.cpp
+  reductions/host_udf_example_tests.cu
+  reductions/list_rank_test.cpp
   reductions/rank_tests.cpp
   reductions/reduction_tests.cpp
   reductions/scan_tests.cpp
   reductions/segmented_reduction_tests.cpp
-  reductions/list_rank_test.cpp
   reductions/tdigest_tests.cu
   GPUS 1
   PERCENT 70
@@ -317,14 +318,15 @@ ConfigureTest(
 )
 ConfigureTest(
   PARQUET_TEST
-  io/parquet_test.cpp
+  io/parquet_bloom_filter_test.cu
   io/parquet_chunked_reader_test.cu
   io/parquet_chunked_writer_test.cpp
   io/parquet_common.cpp
   io/parquet_misc_test.cpp
   io/parquet_reader_test.cpp
-  io/parquet_writer_test.cpp
+  io/parquet_test.cpp
   io/parquet_v2_test.cpp
+  io/parquet_writer_test.cpp
   GPUS 1
   PERCENT 30
 )
diff --git a/cpp/tests/groupby/host_udf_example_tests.cu b/cpp/tests/groupby/host_udf_example_tests.cu
index a454bd692fc..e1ded37d8a7 100644
--- a/cpp/tests/groupby/host_udf_example_tests.cu
+++ b/cpp/tests/groupby/host_udf_example_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,9 +21,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/groupby.hpp>
-#include <cudf/reduction.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/device_uvector.hpp>
@@ -34,6 +32,9 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/transform.h>
 
+using doubles_col = cudf::test::fixed_width_column_wrapper<double>;
+using int32s_col  = cudf::test::fixed_width_column_wrapper<int32_t>;
+
 namespace {
 /**
  * @brief A host-based UDF implementation for groupby.
@@ -41,42 +42,21 @@ namespace {
  * For each group of values, the aggregation computes
  * `(group_idx + 1) * group_sum_of_squares - group_max * group_sum`.
  */
-struct host_udf_groupby_example : cudf::host_udf_base {
+struct host_udf_groupby_example : cudf::groupby_host_udf {
   host_udf_groupby_example() = default;
 
-  [[nodiscard]] data_attribute_set_t get_required_data() const override
-  {
-    // We need grouped values, group offsets, group labels, and also results from groups'
-    // MAX and SUM aggregations.
-    return {groupby_data_attribute::GROUPED_VALUES,
-            groupby_data_attribute::GROUP_OFFSETS,
-            groupby_data_attribute::GROUP_LABELS,
-            cudf::make_max_aggregation<cudf::groupby_aggregation>(),
-            cudf::make_sum_aggregation<cudf::groupby_aggregation>()};
-  }
-
-  [[nodiscard]] output_t get_empty_output(
-    [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
-    [[maybe_unused]] rmm::cuda_stream_view stream,
-    [[maybe_unused]] rmm::device_async_resource_ref mr) const override
+  [[nodiscard]] std::unique_ptr<cudf::column> get_empty_output(
+    rmm::cuda_stream_view, rmm::device_async_resource_ref) const override
   {
     return cudf::make_empty_column(
       cudf::data_type{cudf::type_to_id<typename groupby_fn::OutputType>()});
   }
 
-  [[nodiscard]] output_t operator()(input_map_t const& input,
-                                    rmm::cuda_stream_view stream,
-                                    rmm::device_async_resource_ref mr) const override
+  [[nodiscard]] std::unique_ptr<cudf::column> operator()(
+    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override
   {
-    auto const& values =
-      std::get<cudf::column_view>(input.at(groupby_data_attribute::GROUPED_VALUES));
-    return cudf::type_dispatcher(values.type(), groupby_fn{this}, input, stream, mr);
-  }
-
-  [[nodiscard]] std::size_t do_hash() const override
-  {
-    // Just return the same hash for all instances of this class.
-    return std::size_t{12345};
+    auto const values = get_grouped_values();
+    return cudf::type_dispatcher(values.type(), groupby_fn{*this}, stream, mr);
   }
 
   [[nodiscard]] bool is_equal(host_udf_base const& other) const override
@@ -92,37 +72,33 @@ struct host_udf_groupby_example : cudf::host_udf_base {
 
   struct groupby_fn {
     // Store pointer to the parent class so we can call its functions.
-    host_udf_groupby_example const* parent;
+    host_udf_groupby_example const& parent;
 
-    // For simplicity, this example only accepts double input and always produces double output.
+    // For simplicity, this example only accepts a single type input and output.
     using InputType  = double;
     using OutputType = double;
 
     template <typename T, typename... Args, CUDF_ENABLE_IF(!std::is_same_v<InputType, T>)>
-    output_t operator()(Args...) const
+    std::unique_ptr<cudf::column> operator()(Args...) const
     {
       CUDF_FAIL("Unsupported input type.");
     }
 
     template <typename T, CUDF_ENABLE_IF(std::is_same_v<InputType, T>)>
-    output_t operator()(input_map_t const& input,
-                        rmm::cuda_stream_view stream,
-                        rmm::device_async_resource_ref mr) const
+    std::unique_ptr<cudf::column> operator()(rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr) const
     {
-      auto const& values =
-        std::get<cudf::column_view>(input.at(groupby_data_attribute::GROUPED_VALUES));
-      if (values.size() == 0) { return parent->get_empty_output(std::nullopt, stream, mr); }
+      auto const values = parent.get_grouped_values();
+      if (values.size() == 0) { return parent.get_empty_output(stream, mr); }
 
-      auto const offsets = std::get<cudf::device_span<cudf::size_type const>>(
-        input.at(groupby_data_attribute::GROUP_OFFSETS));
+      auto const offsets = parent.get_group_offsets();
       CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets.");
       auto const num_groups    = static_cast<int>(offsets.size()) - 1;
-      auto const group_indices = std::get<cudf::device_span<cudf::size_type const>>(
-        input.at(groupby_data_attribute::GROUP_LABELS));
-      auto const group_max = std::get<cudf::column_view>(
-        input.at(cudf::make_max_aggregation<cudf::groupby_aggregation>()));
-      auto const group_sum = std::get<cudf::column_view>(
-        input.at(cudf::make_sum_aggregation<cudf::groupby_aggregation>()));
+      auto const group_indices = parent.get_group_labels();
+      auto const group_max =
+        parent.compute_aggregation(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+      auto const group_sum =
+        parent.compute_aggregation(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
 
       auto const values_dv_ptr = cudf::column_device_view::create(values, stream);
       auto const output = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<OutputType>()},
@@ -191,9 +167,6 @@ struct host_udf_groupby_example : cudf::host_udf_base {
 
 }  // namespace
 
-using doubles_col = cudf::test::fixed_width_column_wrapper<double>;
-using int32s_col  = cudf::test::fixed_width_column_wrapper<int32_t>;
-
 struct HostUDFGroupbyExampleTest : cudf::test::BaseFixture {};
 
 TEST_F(HostUDFGroupbyExampleTest, SimpleInput)
diff --git a/cpp/tests/groupby/host_udf_tests.cpp b/cpp/tests/groupby/host_udf_tests.cpp
index 1a0f68c0c6c..17da28cdefc 100644
--- a/cpp/tests/groupby/host_udf_tests.cpp
+++ b/cpp/tests/groupby/host_udf_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,178 +26,121 @@
 #include <vector>
 
 namespace {
+
 /**
- * @brief A host-based UDF implementation used for unit tests.
+ * @brief Generate a random aggregation object from {min, max, sum, product}.
  */
-struct host_udf_test_base : cudf::host_udf_base {
+std::unique_ptr<cudf::aggregation> get_random_agg()
+{
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_int_distribution<int> distr(1, 4);
+  switch (distr(gen)) {
+    case 1: return cudf::make_min_aggregation();
+    case 2: return cudf::make_max_aggregation();
+    case 3: return cudf::make_sum_aggregation();
+    case 4: return cudf::make_product_aggregation();
+    default: CUDF_UNREACHABLE("This should not be reached.");
+  }
+  return nullptr;
+}
+
+/**
+ * @brief A host-based UDF implementation used for unit tests for groupby aggregation.
+ */
+struct host_udf_groupby_test : cudf::groupby_host_udf {
   int test_location_line;  // the location where testing is called
   bool* test_run;          // to check if the test is accidentally skipped
-  data_attribute_set_t input_attrs;
+  bool test_other_agg;     // test calling other aggregation
 
-  host_udf_test_base(int test_location_line_, bool* test_run_, data_attribute_set_t input_attrs_)
-    : test_location_line{test_location_line_},
-      test_run{test_run_},
-      input_attrs(std::move(input_attrs_))
+  host_udf_groupby_test(int test_location_line_, bool* test_run_, bool test_other_agg_)
+    : test_location_line{test_location_line_}, test_run{test_run_}, test_other_agg{test_other_agg_}
   {
   }
 
-  [[nodiscard]] data_attribute_set_t get_required_data() const override { return input_attrs; }
-
-  // This is the main testing function, which checks for the correctness of input data.
-  // The rests are just to satisfy the interface.
-  [[nodiscard]] output_t operator()(input_map_t const& input,
-                                    rmm::cuda_stream_view stream,
-                                    rmm::device_async_resource_ref mr) const override
+  [[nodiscard]] std::size_t do_hash() const override { return 0; }
+  [[nodiscard]] bool is_equal(host_udf_base const& other) const override
   {
-    SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line));
-
-    test_data_attributes(input, stream, mr);
-
-    *test_run = true;  // test is run successfully
-    return get_empty_output(std::nullopt, stream, mr);
+    // Just check if the other object is also instance of this class.
+    return dynamic_cast<host_udf_groupby_test const*>(&other) != nullptr;
+  }
+  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+  {
+    return std::make_unique<host_udf_groupby_test>(test_location_line, test_run, test_other_agg);
   }
 
-  [[nodiscard]] output_t get_empty_output(
-    [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
+  [[nodiscard]] std::unique_ptr<cudf::column> get_empty_output(
     [[maybe_unused]] rmm::cuda_stream_view stream,
     [[maybe_unused]] rmm::device_async_resource_ref mr) const override
   {
-    // Unused function - dummy output.
+    // Dummy output.
     return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
   }
 
-  [[nodiscard]] std::size_t do_hash() const override { return 0; }
-  [[nodiscard]] bool is_equal(host_udf_base const& other) const override { return true; }
+  [[nodiscard]] std::unique_ptr<cudf::column> operator()(
+    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override
+  {
+    SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line));
 
-  // The main test function, which must be implemented for each kind of aggregations
-  // (groupby/reduction/segmented_reduction).
-  virtual void test_data_attributes(input_map_t const& input,
-                                    rmm::cuda_stream_view stream,
-                                    rmm::device_async_resource_ref mr) const = 0;
-};
+    // Perform tests on types for the groupby data: we must ensure the data corresponding to each
+    // `groupby_data` enum having the correct type.
 
-/**
- * @brief A host-based UDF implementation used for unit tests for groupby aggregation.
- */
-struct host_udf_groupby_test : host_udf_test_base {
-  host_udf_groupby_test(int test_location_line_,
-                        bool* test_run_,
-                        data_attribute_set_t input_attrs_ = {})
-    : host_udf_test_base(test_location_line_, test_run_, std::move(input_attrs_))
-  {
-  }
+    {
+      auto const inp_data = get_input_values();
+      EXPECT_TRUE((std::is_same_v<cudf::column_view, std::decay_t<decltype(inp_data)>>));
+    }
 
-  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
-  {
-    return std::make_unique<host_udf_groupby_test>(test_location_line, test_run, input_attrs);
-  }
+    {
+      auto const inp_data = get_grouped_values();
+      EXPECT_TRUE((std::is_same_v<cudf::column_view, std::decay_t<decltype(inp_data)>>));
+    }
 
-  void test_data_attributes(input_map_t const& input,
-                            rmm::cuda_stream_view stream,
-                            rmm::device_async_resource_ref mr) const override
-  {
-    data_attribute_set_t check_attrs = input_attrs;
-    if (check_attrs.empty()) {
-      check_attrs = data_attribute_set_t{groupby_data_attribute::INPUT_VALUES,
-                                         groupby_data_attribute::GROUPED_VALUES,
-                                         groupby_data_attribute::SORTED_GROUPED_VALUES,
-                                         groupby_data_attribute::NUM_GROUPS,
-                                         groupby_data_attribute::GROUP_OFFSETS,
-                                         groupby_data_attribute::GROUP_LABELS};
+    {
+      auto const inp_data = get_sorted_grouped_values();
+      EXPECT_TRUE((std::is_same_v<cudf::column_view, std::decay_t<decltype(inp_data)>>));
     }
-    EXPECT_EQ(input.size(), check_attrs.size());
-    for (auto const& attr : check_attrs) {
-      EXPECT_TRUE(input.count(attr) > 0);
-      EXPECT_TRUE(std::holds_alternative<groupby_data_attribute>(attr.value) ||
-                  std::holds_alternative<std::unique_ptr<cudf::aggregation>>(attr.value));
-      if (std::holds_alternative<groupby_data_attribute>(attr.value)) {
-        switch (std::get<groupby_data_attribute>(attr.value)) {
-          case groupby_data_attribute::INPUT_VALUES:
-            EXPECT_TRUE(std::holds_alternative<cudf::column_view>(input.at(attr)));
-            break;
-          case groupby_data_attribute::GROUPED_VALUES:
-            EXPECT_TRUE(std::holds_alternative<cudf::column_view>(input.at(attr)));
-            break;
-          case groupby_data_attribute::SORTED_GROUPED_VALUES:
-            EXPECT_TRUE(std::holds_alternative<cudf::column_view>(input.at(attr)));
-            break;
-          case groupby_data_attribute::NUM_GROUPS:
-            EXPECT_TRUE(std::holds_alternative<cudf::size_type>(input.at(attr)));
-            break;
-          case groupby_data_attribute::GROUP_OFFSETS:
-            EXPECT_TRUE(
-              std::holds_alternative<cudf::device_span<cudf::size_type const>>(input.at(attr)));
-            break;
-          case groupby_data_attribute::GROUP_LABELS:
-            EXPECT_TRUE(
-              std::holds_alternative<cudf::device_span<cudf::size_type const>>(input.at(attr)));
-            break;
-          default:;
-        }
-      } else {  // std::holds_alternative<std::unique_ptr<cudf::aggregation>>(attr.value)
-        EXPECT_TRUE(std::holds_alternative<cudf::column_view>(input.at(attr)));
-      }
+
+    {
+      auto const inp_data = get_num_groups();
+      EXPECT_TRUE((std::is_same_v<cudf::size_type, std::decay_t<decltype(inp_data)>>));
     }
-  }
-};
 
-/**
- * @brief Get a random subset of input data attributes.
- */
-cudf::host_udf_base::data_attribute_set_t get_subset(
-  cudf::host_udf_base::data_attribute_set_t const& attrs)
-{
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_int_distribution<std::size_t> size_distr(1, attrs.size() - 1);
-  auto const subset_size = size_distr(gen);
-  auto const elements =
-    std::vector<cudf::host_udf_base::data_attribute>(attrs.begin(), attrs.end());
-  std::uniform_int_distribution<std::size_t> idx_distr(0, attrs.size() - 1);
-  cudf::host_udf_base::data_attribute_set_t output;
-  while (output.size() < subset_size) {
-    output.insert(elements[idx_distr(gen)]);
-  }
-  return output;
-}
+    {
+      auto const inp_data = get_group_offsets();
+      EXPECT_TRUE((std::is_same_v<cudf::device_span<cudf::size_type const>,
+                                  std::decay_t<decltype(inp_data)>>));
+    }
 
-/**
- * @brief Generate a random aggregation object from {min, max, sum, product}.
- */
-std::unique_ptr<cudf::aggregation> get_random_agg()
-{
-  std::random_device rd;
-  std::mt19937 gen(rd());
-  std::uniform_int_distribution<int> distr(1, 4);
-  switch (distr(gen)) {
-    case 1: return cudf::make_min_aggregation();
-    case 2: return cudf::make_max_aggregation();
-    case 3: return cudf::make_sum_aggregation();
-    case 4: return cudf::make_product_aggregation();
-    default: CUDF_UNREACHABLE("This should not be reached.");
+    {
+      auto const inp_data = get_group_labels();
+      EXPECT_TRUE((std::is_same_v<cudf::device_span<cudf::size_type const>,
+                                  std::decay_t<decltype(inp_data)>>));
+    }
+
+    // Perform tests on type of the result from computing other aggregations.
+    if (test_other_agg) {
+      auto const inp_data = compute_aggregation(get_random_agg());
+      EXPECT_TRUE((std::is_same_v<cudf::column_view, std::decay_t<decltype(inp_data)>>));
+    }
+
+    *test_run = true;  // test is run successfully
+    return get_empty_output(stream, mr);
   }
-  return nullptr;
-}
+};
 
 }  // namespace
 
 using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
 
-// Number of randomly testing on the input data attributes.
-// For each test, a subset of data attributes will be randomly generated from all the possible input
-// data attributes. The input data corresponding to that subset passed from libcudf will be tested
-// for correctness.
-constexpr int NUM_RANDOM_TESTS = 20;
-
 struct HostUDFTest : cudf::test::BaseFixture {};
 
-TEST_F(HostUDFTest, GroupbyAllInput)
+TEST_F(HostUDFTest, GroupbyBuiltinInput)
 {
   bool test_run   = false;
   auto const keys = int32s_col{0, 1, 2};
   auto const vals = int32s_col{0, 1, 2};
   auto agg        = cudf::make_host_udf_aggregation<cudf::groupby_aggregation>(
-    std::make_unique<host_udf_groupby_test>(__LINE__, &test_run));
+    std::make_unique<host_udf_groupby_test>(__LINE__, &test_run, /*test_other_agg*/ false));
 
   std::vector<cudf::groupby::aggregation_request> requests;
   requests.emplace_back();
@@ -205,28 +148,22 @@ TEST_F(HostUDFTest, GroupbyAllInput)
   requests[0].aggregations.push_back(std::move(agg));
   cudf::groupby::groupby gb_obj(
     cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {});
-  [[maybe_unused]] auto const grp_result =
-    gb_obj.aggregate(requests, cudf::test::get_default_stream());
+  [[maybe_unused]] auto const grp_result = gb_obj.aggregate(
+    requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
   EXPECT_TRUE(test_run);
 }
 
-TEST_F(HostUDFTest, GroupbySomeInput)
+TEST_F(HostUDFTest, GroupbyWithCallingOtherAggregations)
 {
-  auto const keys      = int32s_col{0, 1, 2};
-  auto const vals      = int32s_col{0, 1, 2};
-  auto const all_attrs = cudf::host_udf_base::data_attribute_set_t{
-    cudf::host_udf_base::groupby_data_attribute::INPUT_VALUES,
-    cudf::host_udf_base::groupby_data_attribute::GROUPED_VALUES,
-    cudf::host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES,
-    cudf::host_udf_base::groupby_data_attribute::NUM_GROUPS,
-    cudf::host_udf_base::groupby_data_attribute::GROUP_OFFSETS,
-    cudf::host_udf_base::groupby_data_attribute::GROUP_LABELS};
+  auto const keys = int32s_col{0, 1, 2};
+  auto const vals = int32s_col{0, 1, 2};
+
+  constexpr int NUM_RANDOM_TESTS = 20;
+
   for (int i = 0; i < NUM_RANDOM_TESTS; ++i) {
-    bool test_run    = false;
-    auto input_attrs = get_subset(all_attrs);
-    input_attrs.insert(get_random_agg());
-    auto agg = cudf::make_host_udf_aggregation<cudf::groupby_aggregation>(
-      std::make_unique<host_udf_groupby_test>(__LINE__, &test_run, std::move(input_attrs)));
+    bool test_run = false;
+    auto agg      = cudf::make_host_udf_aggregation<cudf::groupby_aggregation>(
+      std::make_unique<host_udf_groupby_test>(__LINE__, &test_run, /*test_other_agg*/ true));
 
     std::vector<cudf::groupby::aggregation_request> requests;
     requests.emplace_back();
@@ -234,8 +171,8 @@ TEST_F(HostUDFTest, GroupbySomeInput)
     requests[0].aggregations.push_back(std::move(agg));
     cudf::groupby::groupby gb_obj(
       cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {});
-    [[maybe_unused]] auto const grp_result =
-      gb_obj.aggregate(requests, cudf::test::get_default_stream());
+    [[maybe_unused]] auto const grp_result = gb_obj.aggregate(
+      requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
     EXPECT_TRUE(test_run);
   }
 }
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 2209a30149d..708c2045a74 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -2068,6 +2068,7 @@ TEST_P(OrcCompressionTest, Basic)
 INSTANTIATE_TEST_CASE_P(OrcCompressionTest,
                         OrcCompressionTest,
                         ::testing::Values(cudf::io::compression_type::NONE,
+                                          cudf::io::compression_type::AUTO,
                                           cudf::io::compression_type::SNAPPY,
                                           cudf::io::compression_type::LZ4,
                                           cudf::io::compression_type::ZSTD));
diff --git a/cpp/tests/io/parquet_bloom_filter_test.cu b/cpp/tests/io/parquet_bloom_filter_test.cu
new file mode 100644
index 00000000000..d858f58fa56
--- /dev/null
+++ b/cpp/tests/io/parquet_bloom_filter_test.cu
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+
+#include <cudf/detail/cuco_helpers.hpp>
+#include <cudf/hashing.hpp>
+#include <cudf/hashing/detail/xxhash_64.cuh>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <cuco/bloom_filter.cuh>
+#include <cuco/bloom_filter_policies.cuh>
+
+using StringType = cudf::string_view;
+
+class ParquetBloomFilterTest : public cudf::test::BaseFixture {};
+
+TEST_F(ParquetBloomFilterTest, TestStrings)
+{
+  using key_type    = StringType;
+  using policy_type = cuco::arrow_filter_policy<key_type, cudf::hashing::detail::XXHash_64>;
+  using word_type   = policy_type::word_type;
+
+  std::size_t constexpr num_filter_blocks = 4;
+  auto stream                             = cudf::get_default_stream();
+
+  // strings keys to insert
+  auto keys = cudf::test::strings_column_wrapper(
+    {"seventh",    "fifteenth",   "second",      "tenth",      "fifth",       "first",
+     "seventh",    "tenth",       "ninth",       "ninth",      "seventeenth", "eighteenth",
+     "thirteenth", "fifth",       "fourth",      "twelfth",    "second",      "second",
+     "fourth",     "seventh",     "seventh",     "tenth",      "thirteenth",  "seventeenth",
+     "fifth",      "seventeenth", "eighth",      "fourth",     "second",      "eighteenth",
+     "fifteenth",  "second",      "seventeenth", "thirteenth", "eighteenth",  "fifth",
+     "seventh",    "tenth",       "fourteenth",  "first",      "fifth",       "fifth",
+     "tenth",      "thirteenth",  "fourteenth",  "third",      "third",       "sixth",
+     "first",      "third"});
+
+  auto d_keys = cudf::column_device_view::create(keys);
+
+  // Spawn a bloom filter
+  cuco::bloom_filter<key_type,
+                     cuco::extent<size_t>,
+                     cuda::thread_scope_device,
+                     policy_type,
+                     cudf::detail::cuco_allocator<char>>
+    filter{num_filter_blocks,
+           cuco::thread_scope_device,
+           {{cudf::DEFAULT_HASH_SEED}},
+           cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+           stream};
+
+  // Add strings to the bloom filter
+  filter.add(d_keys->begin<key_type>(), d_keys->end<key_type>(), stream);
+
+  // Number of words in the filter
+  cudf::size_type const num_words = filter.block_extent() * filter.words_per_block;
+
+  // Filter bitset as a column
+  auto const bitset = cudf::column_view{
+    cudf::data_type{cudf::type_id::UINT32}, num_words, filter.data(), nullptr, 0, 0, {}};
+
+  // Expected filter bitset words computed using Arrow's implementation here:
+  // https://godbolt.org/z/oKfqcPWbY
+  auto expected = cudf::test::fixed_width_column_wrapper<word_type>(
+    {4194306U,    4194305U,    2359296U,  1073774592U, 524544U,    1024U,      268443648U,
+     8519680U,    2147500040U, 8421380U,  269500416U,  4202624U,   8396802U,   100665344U,
+     2147747840U, 5243136U,    131146U,   655364U,     285345792U, 134222340U, 545390596U,
+     2281717768U, 51201U,      41943553U, 1619656708U, 67441680U,  8462730U,   361220U,
+     2216738864U, 587333888U,  4219272U,  873463873U});
+
+  // Check the bitset for equality
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(bitset, expected);
+}
diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp
index d66f685cd9c..419ac909ac6 100644
--- a/cpp/tests/io/parquet_misc_test.cpp
+++ b/cpp/tests/io/parquet_misc_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -268,6 +268,7 @@ TEST_P(ParquetCompressionTest, Basic)
 INSTANTIATE_TEST_CASE_P(ParquetCompressionTest,
                         ParquetCompressionTest,
                         ::testing::Values(cudf::io::compression_type::NONE,
+                                          cudf::io::compression_type::AUTO,
                                           cudf::io::compression_type::SNAPPY,
                                           cudf::io::compression_type::LZ4,
                                           cudf::io::compression_type::ZSTD));
diff --git a/cpp/tests/reductions/host_udf_example_tests.cu b/cpp/tests/reductions/host_udf_example_tests.cu
new file mode 100644
index 00000000000..67b88c5306b
--- /dev/null
+++ b/cpp/tests/reductions/host_udf_example_tests.cu
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/aggregation/host_udf.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/std/limits>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/transform.h>
+
+using doubles_col = cudf::test::fixed_width_column_wrapper<double>;
+using int32s_col  = cudf::test::fixed_width_column_wrapper<int32_t>;
+using int64s_col  = cudf::test::fixed_width_column_wrapper<int64_t>;
+
+namespace {
+/**
+ * @brief A host-based UDF implementation for reduction.
+ *
+ * The aggregation computes `sum(value^2, for value in group)` (this is sum of squared).
+ */
+struct host_udf_reduction_example : cudf::reduce_host_udf {
+  host_udf_reduction_example() = default;
+
+  [[nodiscard]] std::unique_ptr<cudf::scalar> operator()(
+    cudf::column_view const& input,
+    cudf::data_type output_dtype,
+    std::optional<std::reference_wrapper<cudf::scalar const>> init,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) const override
+  {
+    return cudf::double_type_dispatcher(
+      input.type(), output_dtype, reduce_fn{}, input, output_dtype, init, stream, mr);
+  }
+
+  [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+  {
+    // Just check if the other object is also instance of this class.
+    return dynamic_cast<host_udf_reduction_example const*>(&other) != nullptr;
+  }
+
+  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+  {
+    return std::make_unique<host_udf_reduction_example>();
+  }
+
+  struct reduce_fn {
+    // For simplicity, this example only accepts a single type input and output.
+    using InputType  = double;
+    using OutputType = int64_t;
+
+    template <typename T,
+              typename U,
+              typename... Args,
+              CUDF_ENABLE_IF(!std::is_same_v<InputType, T> || !std::is_same_v<OutputType, U>)>
+    std::unique_ptr<cudf::scalar> operator()(Args...) const
+    {
+      CUDF_FAIL("Unsupported input/output type.");
+    }
+
+    template <typename T,
+              typename U,
+              CUDF_ENABLE_IF(std::is_same_v<InputType, T>&& std::is_same_v<OutputType, U>)>
+    [[nodiscard]] std::unique_ptr<cudf::scalar> operator()(
+      cudf::column_view const& input,
+      cudf::data_type output_dtype,
+      std::optional<std::reference_wrapper<cudf::scalar const>> init,
+      rmm::cuda_stream_view stream,
+      rmm::device_async_resource_ref mr) const
+    {
+      CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id<OutputType>()},
+                   "Invalid output type.");
+      if (input.size() == 0) {
+        return cudf::make_default_constructed_scalar(output_dtype, stream, mr);
+      }
+
+      auto const init_value = [&]() -> InputType {
+        if (init.has_value() && init.value().get().is_valid(stream)) {
+          auto const numeric_init_scalar =
+            dynamic_cast<cudf::numeric_scalar<InputType> const*>(&init.value().get());
+          CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction.");
+          return numeric_init_scalar->value(stream);
+        }
+        return InputType{0};
+      }();
+
+      auto const input_dv_ptr = cudf::column_device_view::create(input, stream);
+      auto const result       = thrust::transform_reduce(rmm::exec_policy(stream),
+                                                   thrust::make_counting_iterator(0),
+                                                   thrust::make_counting_iterator(input.size()),
+                                                   transform_fn{*input_dv_ptr},
+                                                   static_cast<OutputType>(init_value),
+                                                   thrust::plus<>{});
+
+      auto output = cudf::make_numeric_scalar(output_dtype, stream, mr);
+      static_cast<cudf::scalar_type_t<OutputType>*>(output.get())->set_value(result, stream);
+      return output;
+    }
+
+    struct transform_fn {
+      cudf::column_device_view values;
+      OutputType __device__ operator()(cudf::size_type idx) const
+      {
+        if (values.is_null(idx)) { return OutputType{0}; }
+        auto const val = static_cast<OutputType>(values.element<InputType>(idx));
+        return val * val;
+      }
+    };
+  };
+};
+
+}  // namespace
+
+struct HostUDFReductionExampleTest : cudf::test::BaseFixture {};
+
+TEST_F(HostUDFReductionExampleTest, SimpleInput)
+{
+  auto const vals = doubles_col{0.0, 1.0, 2.0, 3.0, 4.0, 5.0};
+  auto const agg  = cudf::make_host_udf_aggregation<cudf::reduce_aggregation>(
+    std::make_unique<host_udf_reduction_example>());
+  auto const reduced = cudf::reduce(vals,
+                                    *agg,
+                                    cudf::data_type{cudf::type_id::INT64},
+                                    cudf::get_default_stream(),
+                                    cudf::get_current_device_resource_ref());
+  EXPECT_TRUE(reduced->is_valid());
+  EXPECT_EQ(cudf::type_id::INT64, reduced->type().id());
+  auto const result =
+    static_cast<cudf::scalar_type_t<int64_t>*>(reduced.get())->value(cudf::get_default_stream());
+  auto constexpr expected = 55;  // 0^2 + 1^2 + 2^2 + 3^2 + 4^2 + 5^2 = 55
+  EXPECT_EQ(expected, result);
+}
+
+TEST_F(HostUDFReductionExampleTest, EmptyInput)
+{
+  auto const vals = doubles_col{};
+  auto const agg  = cudf::make_host_udf_aggregation<cudf::reduce_aggregation>(
+    std::make_unique<host_udf_reduction_example>());
+  auto const reduced = cudf::reduce(vals,
+                                    *agg,
+                                    cudf::data_type{cudf::type_id::INT64},
+                                    cudf::get_default_stream(),
+                                    cudf::get_current_device_resource_ref());
+  EXPECT_FALSE(reduced->is_valid());
+  EXPECT_EQ(cudf::type_id::INT64, reduced->type().id());
+}
+
+namespace {
+
+/**
+ * @brief A host-based UDF implementation for segmented reduction.
+ *
+ * The aggregation computes `sum(value^2, for value in group)` (this is sum of squared).
+ */
+struct host_udf_segmented_reduction_example : cudf::segmented_reduce_host_udf {
+  host_udf_segmented_reduction_example() = default;
+
+  [[nodiscard]] std::unique_ptr<cudf::column> operator()(
+    cudf::column_view const& input,
+    cudf::device_span<cudf::size_type const> offsets,
+    cudf::data_type output_dtype,
+    cudf::null_policy null_handling,
+    std::optional<std::reference_wrapper<cudf::scalar const>> init,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) const override
+  {
+    return cudf::double_type_dispatcher(input.type(),
+                                        output_dtype,
+                                        segmented_reduce_fn{},
+                                        input,
+                                        offsets,
+                                        output_dtype,
+                                        null_handling,
+                                        init,
+                                        stream,
+                                        mr);
+  }
+
+  [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+  {
+    // Just check if the other object is also instance of this class.
+    return dynamic_cast<host_udf_segmented_reduction_example const*>(&other) != nullptr;
+  }
+
+  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+  {
+    return std::make_unique<host_udf_segmented_reduction_example>();
+  }
+
+  struct segmented_reduce_fn {
+    // For simplicity, this example only accepts a single type input and output.
+    using InputType  = double;
+    using OutputType = int64_t;
+
+    template <typename T,
+              typename U,
+              typename... Args,
+              CUDF_ENABLE_IF(!std::is_same_v<InputType, T> || !std::is_same_v<OutputType, U>)>
+    std::unique_ptr<cudf::column> operator()(Args...) const
+    {
+      CUDF_FAIL("Unsupported input/output type.");
+    }
+
+    template <typename T,
+              typename U,
+              CUDF_ENABLE_IF(std::is_same_v<InputType, T>&& std::is_same_v<OutputType, U>)>
+    std::unique_ptr<cudf::column> operator()(
+      cudf::column_view const& input,
+      cudf::device_span<cudf::size_type const> offsets,
+      cudf::data_type output_dtype,
+      cudf::null_policy null_handling,
+      std::optional<std::reference_wrapper<cudf::scalar const>> init,
+      rmm::cuda_stream_view stream,
+      rmm::device_async_resource_ref mr) const
+    {
+      CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id<OutputType>()},
+                   "Invalid output type.");
+      CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets.");
+      auto const num_segments = static_cast<cudf::size_type>(offsets.size()) - 1;
+
+      if (input.size() == 0) {
+        if (num_segments <= 0) { return cudf::make_empty_column(output_dtype); }
+        return cudf::make_numeric_column(
+          output_dtype, num_segments, cudf::mask_state::ALL_NULL, stream, mr);
+      }
+
+      auto const init_value = [&]() -> InputType {
+        if (init.has_value() && init.value().get().is_valid(stream)) {
+          auto const numeric_init_scalar =
+            dynamic_cast<cudf::numeric_scalar<InputType> const*>(&init.value().get());
+          CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction.");
+          return numeric_init_scalar->value(stream);
+        }
+        return InputType{0};
+      }();
+
+      auto const input_dv_ptr = cudf::column_device_view::create(input, stream);
+      auto output             = cudf::make_numeric_column(
+        output_dtype, num_segments, cudf::mask_state::UNALLOCATED, stream);
+
+      // Store row index if it is valid, otherwise store a negative value denoting a null row.
+      rmm::device_uvector<cudf::size_type> valid_idx(num_segments, stream);
+
+      thrust::transform(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_segments),
+        thrust::make_zip_iterator(output->mutable_view().begin<OutputType>(), valid_idx.begin()),
+        transform_fn{*input_dv_ptr, offsets, static_cast<OutputType>(init_value), null_handling});
+
+      auto const valid_idx_cv = cudf::column_view{
+        cudf::data_type{cudf::type_id::INT32}, num_segments, valid_idx.begin(), nullptr, 0};
+      return std::move(cudf::gather(cudf::table_view{{output->view()}},
+                                    valid_idx_cv,
+                                    cudf::out_of_bounds_policy::NULLIFY,
+                                    stream,
+                                    mr)
+                         ->release()
+                         .front());
+    }
+
+    struct transform_fn {
+      cudf::column_device_view values;
+      cudf::device_span<cudf::size_type const> offsets;
+      OutputType init_value;
+      cudf::null_policy null_handling;
+
+      thrust::tuple<OutputType, cudf::size_type> __device__ operator()(cudf::size_type idx) const
+      {
+        auto const start = offsets[idx];
+        auto const end   = offsets[idx + 1];
+
+        auto constexpr invalid_idx = cuda::std::numeric_limits<cudf::size_type>::lowest();
+        if (start == end) { return {OutputType{0}, invalid_idx}; }
+
+        auto sum = init_value;
+        for (auto i = start; i < end; ++i) {
+          if (values.is_null(i)) {
+            if (null_handling == cudf::null_policy::INCLUDE) { sum += init_value * init_value; }
+            continue;
+          }
+          auto const val = static_cast<OutputType>(values.element<InputType>(i));
+          sum += val * val;
+        }
+        auto const segment_size = end - start;
+        return {static_cast<OutputType>(segment_size) * sum, idx};
+      }
+    };
+  };
+};
+
+}  // namespace
+
+struct HostUDFSegmentedReductionExampleTest : cudf::test::BaseFixture {};
+
+TEST_F(HostUDFSegmentedReductionExampleTest, SimpleInput)
+{
+  double constexpr null = 0.0;
+  auto const vals       = doubles_col{{0.0, null, 2.0, 3.0, null, 5.0, null, null, 8.0, 9.0},
+                                      {true, false, true, true, false, true, false, false, true, true}};
+  auto const offsets    = int32s_col{0, 3, 5, 10}.release();
+  auto const agg        = cudf::make_host_udf_aggregation<cudf::segmented_reduce_aggregation>(
+    std::make_unique<host_udf_segmented_reduction_example>());
+
+  // Test without init value.
+  {
+    auto const result = cudf::segmented_reduce(
+      vals,
+      cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+      *agg,
+      cudf::data_type{cudf::type_id::INT64},
+      cudf::null_policy::INCLUDE,
+      std::nullopt,  // init value
+      cudf::get_default_stream(),
+      cudf::get_current_device_resource_ref());
+
+    // When null_policy is set to `INCLUDE`, the null values are replaced with the init value.
+    // Since init value is not given, it is set to 0.
+    // [ 3 * (0^2 + init^2 + 2^2), 2 * (3^2 + init^2), 5 * (5^2 + init^2 + init^2 + 8^2 + 9^2) ]
+    auto const expected = int64s_col{{12, 18, 850}, {true, true, true}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
+  // Test with init value, and include nulls.
+  {
+    auto const init_scalar = cudf::make_fixed_width_scalar<double>(3.0);
+    auto const result      = cudf::segmented_reduce(
+      vals,
+      cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+      *agg,
+      cudf::data_type{cudf::type_id::INT64},
+      cudf::null_policy::INCLUDE,
+      *init_scalar,
+      cudf::get_default_stream(),
+      cudf::get_current_device_resource_ref());
+
+    // When null_policy is set to `INCLUDE`, the null values are replaced with the init value.
+    // [ 3 * (3 + 0^2 + 3^2 + 2^2), 2 * (3 + 3^2 + 3^2), 5 * (3 + 5^2 + 3^2 + 3^2 + 8^2 + 9^2) ]
+    auto const expected = int64s_col{{48, 42, 955}, {true, true, true}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
+  // Test with init value, and exclude nulls.
+  {
+    auto const init_scalar = cudf::make_fixed_width_scalar<double>(3.0);
+    auto const result      = cudf::segmented_reduce(
+      vals,
+      cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+      *agg,
+      cudf::data_type{cudf::type_id::INT64},
+      cudf::null_policy::EXCLUDE,
+      *init_scalar,
+      cudf::get_default_stream(),
+      cudf::get_current_device_resource_ref());
+
+    // [ 3 * (3 + 0^2 + 2^2), 2 * (3 + 3^2), 5 * (3 + 5^2 + 8^2 + 9^2) ]
+    auto const expected = int64s_col{{21, 24, 865}, {true, true, true}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(HostUDFSegmentedReductionExampleTest, EmptySegments)
+{
+  auto const vals    = doubles_col{};
+  auto const offsets = int32s_col{0, 0, 0, 0}.release();
+  auto const agg     = cudf::make_host_udf_aggregation<cudf::segmented_reduce_aggregation>(
+    std::make_unique<host_udf_segmented_reduction_example>());
+  auto const result = cudf::segmented_reduce(
+    vals,
+    cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+    *agg,
+    cudf::data_type{cudf::type_id::INT64},
+    cudf::null_policy::INCLUDE,
+    std::nullopt,  // init value
+    cudf::get_default_stream(),
+    cudf::get_current_device_resource_ref());
+  auto const expected = int64s_col{{0, 0, 0}, {false, false, false}};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(HostUDFSegmentedReductionExampleTest, EmptyInput)
+{
+  auto const vals    = doubles_col{};
+  auto const offsets = int32s_col{}.release();
+  auto const agg     = cudf::make_host_udf_aggregation<cudf::segmented_reduce_aggregation>(
+    std::make_unique<host_udf_segmented_reduction_example>());
+  auto const result = cudf::segmented_reduce(
+    vals,
+    cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+    *agg,
+    cudf::data_type{cudf::type_id::INT64},
+    cudf::null_policy::INCLUDE,
+    std::nullopt,  // init value
+    cudf::get_default_stream(),
+    cudf::get_current_device_resource_ref());
+  auto const expected = int64s_col{};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp
index 221c0a62f3e..47b9ac46d12 100644
--- a/cpp/tests/streams/text/ngrams_test.cpp
+++ b/cpp/tests/streams/text/ngrams_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -45,7 +45,7 @@ TEST_F(TextNGramsTest, HashCharacterNgrams)
   auto input =
     cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
   nvtext::hash_character_ngrams(
-    cudf::strings_column_view(input), 5, cudf::test::get_default_stream());
+    cudf::strings_column_view(input), 5, 5, cudf::test::get_default_stream());
 }
 
 TEST_F(TextNGramsTest, NgramsTokenize)
diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp
index c72c7cfc80e..1a737231389 100644
--- a/cpp/tests/text/ngrams_tests.cpp
+++ b/cpp/tests/text/ngrams_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -159,6 +159,17 @@ TEST_F(TextGenerateNgramsTest, NgramsHash)
                     2319357747u}});
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  results = nvtext::hash_character_ngrams(view, 10, 10);
+  // clang-format off
+  LCW expected2({LCW{2818025299u, 4026424618u, 578054337u, 2107870805u, 3942221995u,
+                     2802685757u, 2686450821u, 584898501u, 2206824201u, 487979059u},
+                 LCW{1154048732u, 3209682333u, 3246563372u, 3789750511u, 1287153502u,
+                     3759561568u, 1092423314u,  339538635u, 4265577390u,  879551618u,
+                     4222824617u, 1774528854u, 1028254379u,  485918316u,  879142987u, 3619248543u}
+  });
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
 }
 
 TEST_F(TextGenerateNgramsTest, NgramsHashErrors)
diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java
index 2276b223740..c07a58ed8a5 100644
--- a/java/src/main/java/ai/rapids/cudf/Aggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
index 27966ddfdd4..234a9ec1ced 100644
--- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java
index 0b6ecf2e140..124f2c99188 100644
--- a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java
+++ b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java
@@ -24,8 +24,10 @@
  * <p>
  * A new host UDF aggregation implementation must extend this class and override the
  * {@code hashCode} and {@code equals} methods for such purposes.
+ * In addition, since this class implements {@code AutoCloseable}, the {@code close} method must
+ * also be overridden to automatically delete the native UDF instance upon class destruction.
  */
-public abstract class HostUDFWrapper {
+public abstract class HostUDFWrapper implements AutoCloseable {
   public final long udfNativeHandle;
 
   public HostUDFWrapper(long udfNativeHandle) {
diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
index ba8ae379bae..4f047a68f06 100644
--- a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -161,14 +161,14 @@ public static ReductionAggregation median() {
   /**
    * Aggregate to compute the specified quantiles. Uses linear interpolation by default.
    */
-  public static ReductionAggregation quantile(double ... quantiles) {
+  public static ReductionAggregation quantile(double... quantiles) {
     return new ReductionAggregation(Aggregation.quantile(quantiles));
   }
 
   /**
    * Aggregate to compute various quantiles.
    */
-  public static ReductionAggregation quantile(QuantileMethod method, double ... quantiles) {
+  public static ReductionAggregation quantile(QuantileMethod method, double... quantiles) {
     return new ReductionAggregation(Aggregation.quantile(method, quantiles));
   }
 
@@ -256,7 +256,7 @@ public static ReductionAggregation collectSet() {
    * @param nanEquality  Flag to specify whether NaN values in floating point column should be considered equal.
    */
   public static ReductionAggregation collectSet(NullPolicy nullPolicy,
-      NullEquality nullEquality, NaNEquality nanEquality) {
+                                                NullEquality nullEquality, NaNEquality nanEquality) {
     return new ReductionAggregation(Aggregation.collectSet(nullPolicy, nullEquality, nanEquality));
   }
 
@@ -286,6 +286,15 @@ public static ReductionAggregation mergeSets(NullEquality nullEquality, NaNEqual
     return new ReductionAggregation(Aggregation.mergeSets(nullEquality, nanEquality));
   }
 
+  /**
+   * Execute a reduction using a host-side user-defined function (UDF).
+   * @param wrapper The wrapper for the native host UDF instance.
+   * @return A new ReductionAggregation instance
+   */
+  public static ReductionAggregation hostUDF(HostUDFWrapper wrapper) {
+    return new ReductionAggregation(Aggregation.hostUDF(wrapper));
+  }
+
   /**
    * Create HistogramAggregation, computing the frequencies for each unique row.
    *
diff --git a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java
index 7ed150a2fec..18e7d874886 100644
--- a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2022, NVIDIA CORPORATION.
+ *  Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -101,4 +101,13 @@ public static SegmentedReductionAggregation any() {
   public static SegmentedReductionAggregation all() {
     return new SegmentedReductionAggregation(Aggregation.all());
   }
+
+  /**
+   * Execute a reduction using a host-side user-defined function (UDF).
+   * @param wrapper The wrapper for the native host UDF instance.
+   * @return A new SegmentedReductionAggregation instance
+   */
+  public static SegmentedReductionAggregation hostUDF(HostUDFWrapper wrapper) {
+    return new SegmentedReductionAggregation(Aggregation.hostUDF(wrapper));
+  }
 }
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 30da8727366..19f2802553d 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2413,7 +2413,11 @@ def as_column(
                 and pa.types.is_integer(arbitrary.type)
                 and arbitrary.null_count > 0
             ):
+                # TODO: Need to re-visit this cast and fill_null
+                # calls while addressing the following issue:
+                # https://github.com/rapidsai/cudf/issues/14149
                 arbitrary = arbitrary.cast(pa.float64())
+                arbitrary = pc.fill_null(arbitrary, np.nan)
             if (
                 cudf.get_option("default_integer_bitwidth")
                 and pa.types.is_integer(arbitrary.type)
diff --git a/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet
new file mode 100644
index 00000000000..4123545a6e0
Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet differ
diff --git a/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_chunk_stats.snappy.parquet b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_chunk_stats.snappy.parquet
new file mode 100644
index 00000000000..7dc2cee21ae
Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_chunk_stats.snappy.parquet differ
diff --git a/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet
new file mode 100644
index 00000000000..e898f1d7d1b
Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet differ
diff --git a/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_chunk_stats.snappy.parquet b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_chunk_stats.snappy.parquet
new file mode 100644
index 00000000000..3060234d499
Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_chunk_stats.snappy.parquet differ
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 77d1f77d30b..9d5f32c7ab9 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1,6 +1,7 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import datetime
+import decimal
 import glob
 import hashlib
 import math
@@ -4370,3 +4371,57 @@ def test_parquet_reader_mismatched_nullability_structs(tmpdir):
         cudf.read_parquet([buf2, buf1]),
         cudf.concat([df2, df1]).reset_index(drop=True),
     )
+
+
+@pytest.mark.parametrize(
+    "stats_fname,bloom_filter_fname",
+    [
+        (
+            "mixed_card_ndv_100_chunk_stats.snappy.parquet",
+            "mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet",
+        ),
+        (
+            "mixed_card_ndv_500_chunk_stats.snappy.parquet",
+            "mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet",
+        ),
+    ],
+)
+@pytest.mark.parametrize(
+    "predicate,expected_len",
+    [
+        ([[("str", "==", "FINDME")], [("fp64", "==", float(500))]], 2),
+        ([("fixed_pt", "==", decimal.Decimal(float(500)))], 2),
+        ([[("ui32", "==", np.uint32(500)), ("str", "==", "FINDME")]], 2),
+        ([[("str", "==", "FINDME")], [("ui32", ">=", np.uint32(0))]], 1000),
+        (
+            [
+                ("str", "!=", "FINDME"),
+                ("fixed_pt", "==", decimal.Decimal(float(500))),
+            ],
+            0,
+        ),
+    ],
+)
+def test_parquet_bloom_filters(
+    datadir, stats_fname, bloom_filter_fname, predicate, expected_len
+):
+    fname_stats = datadir / stats_fname
+    fname_bf = datadir / bloom_filter_fname
+    df_stats = cudf.read_parquet(fname_stats, filters=predicate).reset_index(
+        drop=True
+    )
+    df_bf = cudf.read_parquet(fname_bf, filters=predicate).reset_index(
+        drop=True
+    )
+
+    # Check if tables equal
+    assert_eq(
+        df_stats,
+        df_bf,
+    )
+
+    # Check for table length
+    assert_eq(
+        len(df_stats),
+        expected_len,
+    )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index f8697c5c6b8..891c0ede9a4 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 import datetime
 import decimal
 import hashlib
@@ -3003,3 +3003,12 @@ def test_dtype_dtypes_equal():
     ser = cudf.Series([0])
     assert ser.dtype is ser.dtypes
     assert ser.dtypes is ser.to_pandas().dtypes
+
+
+def test_null_like_to_nan_pandas_compat():
+    with cudf.option_context("mode.pandas_compatible", True):
+        ser = cudf.Series([1, 2, np.nan, 10, None])
+        pser = pd.Series([1, 2, np.nan, 10, None])
+
+        assert pser.dtype == ser.dtype
+        assert_eq(ser, pser)
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index c16df320ceb..e453a8b89b9 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -8,7 +8,9 @@
 from functools import partialmethod
 from typing import TYPE_CHECKING
 
+import fastexcel
 import pytest
+from packaging import version
 
 import polars
 
@@ -44,7 +46,7 @@ def pytest_configure(config: pytest.Config) -> None:
     )
 
 
-EXPECTED_FAILURES: Mapping[str, str] = {
+EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = {
     "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed",
     "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU",
     "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning",
@@ -192,6 +194,10 @@ def pytest_configure(config: pytest.Config) -> None:
     # Maybe flaky, order-dependent?
     "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order",
     "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero",
+    "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": (
+        "Fails when fastexcel version >= 0.12.1. tracking issue: https://github.com/pola-rs/polars/issues/20698",
+        version.parse(fastexcel.__version__) >= version.parse("0.12.1"),
+    ),
 }
 
 
@@ -219,4 +225,12 @@ def pytest_collection_modifyitems(
         if item.nodeid in TESTS_TO_SKIP:
             item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid]))
         elif item.nodeid in EXPECTED_FAILURES:
+            if isinstance(EXPECTED_FAILURES[item.nodeid], tuple):
+                # the second entry in the tuple is the condition to xfail on
+                item.add_marker(
+                    pytest.mark.xfail(
+                        condition=EXPECTED_FAILURES[item.nodeid][1],
+                        reason=EXPECTED_FAILURES[item.nodeid][0],
+                    ),
+                )
             item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid]))