From 231015910bae375077e07c01d2bf70697182ccad Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 9 Jan 2025 16:53:25 -0500 Subject: [PATCH 01/12] Support multithreaded reading of compressed buffers in JSON reader (#17670) Addresses #17638 This PR introduces multithreaded host-side decompression of compressed input buffers passed to the JSON reader, and uses a stream pool to transfer the uncompressed buffers to device. Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17670 --- cpp/src/io/json/read_json.cu | 71 +++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 82d8152ca1c..113342e9cbf 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -37,12 +38,25 @@ #include #include +#include +#include + #include namespace cudf::io::json::detail { namespace { +namespace pools { + +BS::thread_pool& tpool() +{ + static BS::thread_pool _tpool(std::thread::hardware_concurrency()); + return _tpool; +} + +} // namespace pools + class compressed_host_buffer_source final : public datasource { public: explicit compressed_host_buffer_source(std::unique_ptr const& src, @@ -51,8 +65,8 @@ class compressed_host_buffer_source final : public datasource { { auto ch_buffer = host_span(reinterpret_cast(_dbuf_ptr->data()), _dbuf_ptr->size()); - if (comptype == compression_type::GZIP || comptype == compression_type::ZIP || - comptype == compression_type::SNAPPY) { + if (_comptype == compression_type::GZIP || _comptype == compression_type::ZIP || + _comptype == compression_type::SNAPPY) { _decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer); } else { _decompressed_buffer = cudf::io::detail::decompress(_comptype, ch_buffer); @@ -96,7 +110,22 @@ class compressed_host_buffer_source final : public datasource { return std::make_unique(_decompressed_buffer.data() + offset, count); } - [[nodiscard]] bool supports_device_read() const override { return false; } + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + auto& thread_pool = pools::tpool(); + return thread_pool.submit_task([this, offset, size, dst, stream] { + auto hbuf = host_read(offset, size); + CUDF_CUDA_TRY( + cudaMemcpyAsync(dst, hbuf->data(), hbuf->size(), cudaMemcpyHostToDevice, stream.value())); + stream.synchronize(); + return hbuf->size(); + }); + } + + [[nodiscard]] bool supports_device_read() const override { return true; } [[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; } @@ -431,6 +460,8 @@ device_span ingest_raw_input(device_span buffer, // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line // delimiter. 
auto constexpr num_delimiter_chars = 1; + std::vector> thread_tasks; + auto stream_pool = cudf::detail::fork_streams(stream, pools::tpool().get_thread_count()); auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); @@ -447,13 +478,17 @@ device_span ingest_raw_input(device_span buffer, auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset); range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0; - for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) { + for (std::size_t i = start_source, cur_stream = 0; + i < sources.size() && bytes_read < total_bytes_to_read; + i++) { if (sources[i]->is_empty()) continue; auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); auto destination = reinterpret_cast(buffer.data()) + bytes_read + (num_delimiter_chars * delimiter_map.size()); - if (sources[i]->is_device_read_preferred(data_size)) { - bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); + if (sources[i]->supports_device_read()) { + thread_tasks.emplace_back(sources[i]->device_read_async( + range_offset, data_size, destination, stream_pool[cur_stream++ % stream_pool.size()])); + bytes_read += data_size; } else { h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); auto const& h_buffer = h_buffers.back(); @@ -481,6 +516,15 @@ device_span ingest_raw_input(device_span buffer, buffer.data()); } stream.synchronize(); + + if (thread_tasks.size()) { + auto const bytes_read = std::accumulate( + thread_tasks.begin(), thread_tasks.end(), std::size_t{0}, [](std::size_t sum, auto& task) { + return sum + task.get(); + }); + CUDF_EXPECTS(bytes_read == total_bytes_to_read, "something's fishy"); + } + return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); } @@ -505,10 +549,17 @@ table_with_metadata read_json(host_span> sources, return read_json_impl(sources, reader_opts, stream, mr); std::vector> compressed_sources; - for (size_t i = 0; i < sources.size(); i++) { - compressed_sources.emplace_back( - std::make_unique(sources[i], reader_opts.get_compression())); + std::vector>> thread_tasks; + auto& thread_pool = pools::tpool(); + for (auto& src : sources) { + thread_tasks.emplace_back(thread_pool.submit_task([&reader_opts, &src] { + return std::make_unique(src, reader_opts.get_compression()); + })); } + std::transform(thread_tasks.begin(), + thread_tasks.end(), + std::back_inserter(compressed_sources), + [](auto& task) { return task.get(); }); // in read_json_impl, we need the compressed source size to actually be the // uncompressed source size for correct batching return read_json_impl(compressed_sources, reader_opts, stream, mr); From a8a41975b0c1cfaedb7d4461ee027f6f9ff75b0e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:16:04 -0800 Subject: [PATCH 02/12] Remove cudf._libs.types.pyx (#17665) Contributes to https://github.com/rapidsai/cudf/issues/17317 1. Moves some Python routines/objects to `cudf/utils/dtypes.py` 2. 
Moves specific column only routines directly to `cudf/_libs/column.pyx` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17665 --- python/cudf/cudf/_lib/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/column.pxd | 4 +- python/cudf/cudf/_lib/column.pyx | 100 ++++++++-- python/cudf/cudf/_lib/scalar.pyx | 49 ++--- python/cudf/cudf/_lib/types.pxd | 11 -- python/cudf/cudf/_lib/types.pyx | 172 ------------------ python/cudf/cudf/core/_base_index.py | 9 +- .../cudf/cudf/core/_internals/aggregation.py | 4 +- python/cudf/cudf/core/_internals/binaryop.py | 4 +- python/cudf/cudf/core/_internals/unary.py | 4 +- python/cudf/cudf/core/column/categorical.py | 10 +- python/cudf/cudf/core/column/column.py | 27 +-- python/cudf/cudf/core/column/lists.py | 4 +- python/cudf/cudf/core/column/string.py | 12 +- python/cudf/cudf/core/copy_types.py | 6 +- python/cudf/cudf/core/dtypes.py | 5 +- python/cudf/cudf/core/groupby/groupby.py | 21 +-- python/cudf/cudf/core/index.py | 6 +- python/cudf/cudf/core/indexed_frame.py | 3 +- python/cudf/cudf/core/join/join.py | 6 +- python/cudf/cudf/core/multiindex.py | 11 +- python/cudf/cudf/core/reshape.py | 9 +- python/cudf/cudf/io/csv.py | 8 +- python/cudf/cudf/io/json.py | 8 +- python/cudf/cudf/io/orc.py | 4 +- python/cudf/cudf/utils/dtypes.py | 66 ++++++- 26 files changed, 251 insertions(+), 316 deletions(-) delete mode 100644 python/cudf/cudf/_lib/types.pxd delete mode 100644 python/cudf/cudf/_lib/types.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index ff6fba1c3e8..ec44a6aa8c5 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx) +set(cython_sources column.pyx scalar.pyx strings_udf.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8b1d16f0d85..026c12895e8 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from typing import Literal @@ -13,6 +13,8 @@ from pylibcudf.libcudf.column.column_view cimport ( from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer +cdef dtype_from_lists_column_view(column_view cv) +cdef dtype_from_column_view(column_view cv) cdef class Column: cdef public: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f7dcd89ea48..c59bbc0f40c 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from typing import Literal @@ -19,24 +19,21 @@ from cudf.core.buffer import ( as_buffer, cuda_array_interface_wrapper, ) -from cudf.utils.dtypes import _get_base_dtype +from cudf.utils.dtypes import ( + _get_base_dtype, + dtype_to_pylibcudf_type, + PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES, +) from cpython.buffer cimport PyObject_CheckBuffer -from libc.stdint cimport uintptr_t -from libcpp.memory cimport make_unique, unique_ptr +from libc.stdint cimport uintptr_t, int32_t +from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector from rmm.pylibrmm.device_buffer cimport DeviceBuffer -from cudf._lib.types cimport ( - dtype_from_column_view, - dtype_to_pylibcudf_type, -) - -from cudf._lib.types import dtype_from_pylibcudf_column - -from pylibcudf cimport DataType as plc_DataType +from pylibcudf cimport DataType as plc_DataType, Column as plc_Column cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary @@ -45,6 +42,7 @@ from pylibcudf.libcudf.column.column_factories cimport ( make_numeric_column ) from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count from pylibcudf.libcudf.scalar.scalar cimport scalar @@ -64,6 +62,80 @@ cdef get_element(column_view col_view, size_type index): ) +def dtype_from_pylibcudf_column(plc_Column col not None): + type_ = col.type() + tid = type_.id() + + if tid == pylibcudf.TypeId.LIST: + child = col.list_view().child() + return cudf.ListDtype(dtype_from_pylibcudf_column(child)) + elif tid == pylibcudf.TypeId.STRUCT: + fields = { + str(i): dtype_from_pylibcudf_column(col.child(i)) + for i in range(col.num_children()) + } + return cudf.StructDtype(fields) + elif tid == pylibcudf.TypeId.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL32: + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] + + +cdef dtype_from_lists_column_view(column_view cv): + # lists_column_view have no default constructor, so we heap + # allocate it to get around Cython's limitation of requiring + # default constructors for stack allocated objects + cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) + cdef column_view child = lv.get()[0].child() + + if child.type().id() == libcudf_types.type_id.LIST: + return cudf.ListDtype(dtype_from_lists_column_view(child)) + else: + return cudf.ListDtype(dtype_from_column_view(child)) + + +cdef dtype_from_column_view(column_view cv): + cdef libcudf_types.type_id tid = cv.type().id() + if tid == libcudf_types.type_id.LIST: + return dtype_from_lists_column_view(cv) + elif tid == libcudf_types.type_id.STRUCT: + fields = { + str(i): dtype_from_column_view(cv.child(i)) + for i in range(cv.num_children()) + } + return cudf.StructDtype(fields) + elif tid == libcudf_types.type_id.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL32: + return cudf.Decimal32Dtype( + 
precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[(tid)] + + cdef class Column: """ A Column stores columnar data in device memory. @@ -361,7 +433,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[mutable_column_view] children cdef void* data @@ -424,7 +496,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index fd6d0257940..65607c91302 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import copy @@ -14,17 +14,16 @@ import pylibcudf as plc import cudf from cudf.core.dtypes import ListDtype, StructDtype -from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf.core.missing import NA, NaT +from cudf.utils.dtypes import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES # We currently need this cimport because some of the implementations here # access the c_obj of the scalar, and because we need to be able to call # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
-from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID -from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar +from pylibcudf cimport Scalar as plc_Scalar +from pylibcudf.libcudf.scalar.scalar cimport scalar def _replace_nested(obj, check, replacement): @@ -223,40 +222,22 @@ cdef class DeviceScalar: return s cdef void _set_dtype(self, dtype=None): - cdef plc_TypeID cdtype_id = self.c_value.type().id() + cdtype_id = self.c_value.type().id() if dtype is not None: self._dtype = dtype elif cdtype_id in { - plc_TypeID.DECIMAL32, - plc_TypeID.DECIMAL64, - plc_TypeID.DECIMAL128, + plc.TypeID.DECIMAL32, + plc.TypeID.DECIMAL64, + plc.TypeID.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - elif cdtype_id == plc_TypeID.STRUCT: - struct_table_view = (self.get_raw_ptr())[0].view() - self._dtype = StructDtype({ - str(i): dtype_from_column_view(struct_table_view.column(i)) - for i in range(struct_table_view.num_columns()) - }) - elif cdtype_id == plc_TypeID.LIST: - if ( - self.get_raw_ptr() - )[0].view().type().id() == plc_TypeID.LIST: - self._dtype = dtype_from_column_view( - (self.get_raw_ptr())[0].view() - ) - else: - self._dtype = ListDtype( - PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - ( - (self.get_raw_ptr())[0] - .view().type().id() - ) - ] - ) + elif cdtype_id == plc.TypeID.STRUCT: + self._dtype = StructDtype.from_arrow( + plc.interop.to_arrow(self.c_value).type + ) + elif cdtype_id == plc.TypeID.LIST: + self._dtype = ListDtype.from_arrow(plc.interop.to_arrow(self.c_value).type) else: - self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (cdtype_id) - ] + self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[cdtype_id] diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd deleted file mode 100644 index 18b1d26e4db..00000000000 --- a/python/cudf/cudf/_lib/types.pxd +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -from pylibcudf.libcudf.column.column_view cimport column_view - -ctypedef int32_t underlying_type_t_type_id - -cdef dtype_from_column_view(column_view cv) - -cpdef dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx deleted file mode 100644 index 777bd070b32..00000000000 --- a/python/cudf/cudf/_lib/types.pyx +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd - -from libcpp.memory cimport make_shared, shared_ptr - -cimport pylibcudf.libcudf.types as libcudf_types -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view - -import pylibcudf as plc - -import cudf - - -SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { - np.dtype("int8"): plc.types.TypeId.INT8, - np.dtype("int16"): plc.types.TypeId.INT16, - np.dtype("int32"): plc.types.TypeId.INT32, - np.dtype("int64"): plc.types.TypeId.INT64, - np.dtype("uint8"): plc.types.TypeId.UINT8, - np.dtype("uint16"): plc.types.TypeId.UINT16, - np.dtype("uint32"): plc.types.TypeId.UINT32, - np.dtype("uint64"): plc.types.TypeId.UINT64, - np.dtype("float32"): plc.types.TypeId.FLOAT32, - np.dtype("float64"): plc.types.TypeId.FLOAT64, - np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): plc.types.TypeId.STRING, - np.dtype("bool"): plc.types.TypeId.BOOL8, - np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, -} -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - plc_type: np_type - for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() -} -# There's no equivalent to EMPTY in cudf. We translate EMPTY -# columns from libcudf to ``int8`` columns of all nulls in Python. -# ``int8`` is chosen because it uses the least amount of memory. 
-PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") - - -size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] - - -cdef dtype_from_lists_column_view(column_view cv): - # lists_column_view have no default constructor, so we heap - # allocate it to get around Cython's limitation of requiring - # default constructors for stack allocated objects - cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) - cdef column_view child = lv.get()[0].child() - - if child.type().id() == libcudf_types.type_id.LIST: - return cudf.ListDtype(dtype_from_lists_column_view(child)) - elif child.type().id() == libcudf_types.type_id.EMPTY: - return cudf.ListDtype("int8") - else: - return cudf.ListDtype( - dtype_from_column_view(child) - ) - -cdef dtype_from_structs_column_view(column_view cv): - fields = { - str(i): dtype_from_column_view(cv.child(i)) - for i in range(cv.num_children()) - } - return cudf.StructDtype(fields) - -cdef dtype_from_column_view(column_view cv): - cdef libcudf_types.type_id tid = cv.type().id() - if tid == libcudf_types.type_id.LIST: - return dtype_from_lists_column_view(cv) - elif tid == libcudf_types.type_id.STRUCT: - return dtype_from_structs_column_view(cv) - elif tid == libcudf_types.type_id.DECIMAL64: - return cudf.Decimal64Dtype( - precision=cudf.Decimal64Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL32: - return cudf.Decimal32Dtype( - precision=cudf.Decimal32Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL128: - return cudf.Decimal128Dtype( - precision=cudf.Decimal128Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - else: - return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (tid) - ] - - -cpdef dtype_to_pylibcudf_type(dtype): - if isinstance(dtype, cudf.ListDtype): - return plc.DataType(plc.TypeId.LIST) - elif isinstance(dtype, cudf.StructDtype): - return plc.DataType(plc.TypeId.STRUCT) - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = plc.TypeId.DECIMAL128 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = plc.TypeId.DECIMAL64 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = plc.TypeId.DECIMAL32 - return plc.DataType(tid, -dtype.scale) - # libcudf types don't support timezones so convert to the base type - elif isinstance(dtype, pd.DatetimeTZDtype): - dtype = np.dtype(f" ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes return self.codes - gather_map = self.codes.astype(libcudf.types.size_type_dtype).fillna(0) + gather_map = self.codes.astype(SIZE_TYPE_DTYPE).fillna(0) out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1192,10 +1192,10 @@ def _concat( codes = [o.codes for o in objs] newsize = sum(map(len, codes)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e23ca810065..30da8727366 100644 --- 
a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,7 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -60,9 +59,11 @@ from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, + dtype_to_pylibcudf_type, find_common_type, get_time_unit, is_column_like, @@ -874,7 +875,7 @@ def indices_of( value = as_column(value, dtype=self.dtype, length=1) mask = value.contains(self) return apply_boolean_mask( # type: ignore[return-value] - [as_column(range(0, len(self)), dtype=size_type_dtype)], mask + [as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask )[0] def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: @@ -954,7 +955,7 @@ def take( # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. if indices.dtype.kind not in {"u", "i"}: - indices = indices.astype(libcudf.types.size_type_dtype) + indices = indices.astype(SIZE_TYPE_DTYPE) GatherMap(indices, len(self), nullify=not check_bounds or nullify) gathered = copying.gather([self], indices, nullify=nullify) # type: ignore[arg-type] return gathered[0]._with_type_metadata(self.dtype) # type: ignore[return-value] @@ -1743,9 +1744,7 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), + as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1754,21 +1753,16 @@ def column_empty( cudf.core.column.NumericalColumn( data=as_buffer( rmm.DeviceBuffer( - size=row_count - * cudf.dtype(libcudf.types.size_type_dtype).itemsize + size=row_count * cudf.dtype(SIZE_TYPE_DTYPE).itemsize ) ), size=None, - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ) elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = as_buffer(rmm.DeviceBuffer(size=0)) - children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), - ) + children = (as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE),) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -2552,10 +2546,9 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: ) newsize = sum(map(len, objs)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"Result of concat cannot have " f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: return column_empty(0, head.dtype) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6fc2b5d4ca2..04b4003c510 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import 
ColumnBase, as_column @@ -22,6 +21,7 @@ from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype from cudf.core.missing import NA +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from collections.abc import Sequence @@ -258,7 +258,7 @@ def from_sequences( offset_col = cast( NumericalColumn, - column.as_column(offset_vals, dtype=size_type_dtype), + column.as_column(offset_vals, dtype=SIZE_TYPE_DTYPE), ) # Build ListColumn diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 20eded9a27f..2bee85cb387 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,16 +19,18 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + can_convert_to_column, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -5611,7 +5613,7 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: offsets = column.as_column( - 0, length=size + 1, dtype=size_type_dtype + 0, length=size + 1, dtype=SIZE_TYPE_DTYPE ) children = (offsets,) @@ -5888,7 +5890,7 @@ def as_decimal_column( ) -> cudf.core.column.DecimalBaseColumn: plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( self.to_pylibcudf(mode="read"), - libcudf.types.dtype_to_pylibcudf_type(dtype), + dtype_to_pylibcudf_type(dtype), ) result = Column.from_pylibcudf(plc_column) result.dtype.precision = dtype.precision # type: ignore[union-attr] diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 4b6ad59c8e1..aaaf6c7ee4f 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,11 +1,11 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast from typing_extensions import Self import cudf -from cudf._lib.types import size_type_dtype +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from cudf.core.column import NumericalColumn @@ -63,7 +63,7 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): # Alternately we can have an Optional[Column] and handle None # specially in _gather. self.column = cast( - "NumericalColumn", self.column.astype(size_type_dtype) + "NumericalColumn", self.column.astype(SIZE_TYPE_DTYPE) ) else: if self.column.dtype.kind not in {"i", "u"}: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 8ed233ba737..ce7fb968069 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations import decimal @@ -57,7 +57,8 @@ def dtype(arbitrary): if np_dtype.kind in set("OU"): return np.dtype("object") elif ( - np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES + np_dtype + not in cudf.utils.dtypes.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES ): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 17302311a7e..7bc4b08fc49 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -21,7 +21,6 @@ import cudf import cudf.core._internals from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( is_list_like, @@ -46,7 +45,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.dtypes import cudf_dtype_to_pa_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, cudf_dtype_to_pa_type from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -588,7 +587,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: offsets, group_keys, (indices,) = self._groups( [ cudf.core.column.as_column( - range(len(self.obj)), dtype=size_type_dtype + range(len(self.obj)), dtype=SIZE_TYPE_DTYPE ) ] ) @@ -1185,7 +1184,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # aggregation scheme in libcudf. This is probably "fast # enough" for most reasonable input sizes. _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) @@ -1199,7 +1198,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): group_offsets = group_offsets[:-1] else: group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype) + to_take = np.arange(size_per_group.sum(), dtype=SIZE_TYPE_DTYPE) fixup = np.empty_like(size_per_group) fixup[0] = 0 np.cumsum(size_per_group[:-1], out=fixup[1:]) @@ -1500,11 +1499,11 @@ def sample( # into a numpy array directly, rather than a list. # TODO: this uses the sort-based groupby, could one use hash-based? _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( - size_type_dtype.type(n), size_per_group.shape + SIZE_TYPE_DTYPE.type(n), size_per_group.shape ) if not replace and (minsize := size_per_group.min()) < n: raise ValueError( @@ -1517,7 +1516,7 @@ def sample( # which is round-to-nearest, ties to sgn(x) * inf). 
samples_per_group = np.round( size_per_group * frac, decimals=0 - ).astype(size_type_dtype) + ).astype(SIZE_TYPE_DTYPE) if replace: # We would prefer to use cupy here, but their rng.integers # interface doesn't take array-based low and high @@ -1525,7 +1524,7 @@ def sample( low = 0 high = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) - indices = rng.integers(low, high, dtype=size_type_dtype) + indices = rng.integers(low, high, dtype=SIZE_TYPE_DTYPE) indices += np.repeat(group_offsets[:-1], samples_per_group) else: # Approach: do a segmented argsort of the index array and take @@ -1533,7 +1532,7 @@ def sample( # We will shuffle the group indices and then pick them out # from the grouped dataframe index. nrows = len(group_values) - indices = cp.arange(nrows, dtype=size_type_dtype) + indices = cp.arange(nrows, dtype=SIZE_TYPE_DTYPE) if len(size_per_group) < 500: # Empirically shuffling with cupy is faster at this scale rs = cp.random.get_random_state() @@ -1557,7 +1556,7 @@ def sample( indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? - want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) + want = np.arange(samples_per_group.sum(), dtype=SIZE_TYPE_DTYPE) scan = np.empty_like(samples_per_group) scan[0] = 0 np.cumsum(samples_per_group[:-1], out=scan[1:]) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b535e8aabd2..0d1bf552982 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -53,6 +52,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, @@ -1002,7 +1002,7 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [self._range.index(value)] except ValueError: i = [] - return as_column(i, dtype=size_type_dtype) + return as_column(i, dtype=SIZE_TYPE_DTYPE) def isin(self, values, level=None): if level is not None and level > 0: @@ -1348,7 +1348,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = as_column( -1, length=len(needle), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index eded681baf0..4c6f8a9c152 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -60,6 +60,7 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring +from cudf.utils.dtypes import SIZE_TYPE_DTYPE from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf @@ -3034,7 +3035,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: NumericalColumn, as_column( range(start, stop, stride), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ), len(self), diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6e965ceca66..ce7edc8fdbe 100644 --- a/python/cudf/cudf/core/join/join.py +++ 
b/python/cudf/cudf/core/join/join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import Any @@ -7,7 +7,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap @@ -17,6 +16,7 @@ _IndexIndexer, _match_join_keys, ) +from cudf.utils.dtypes import SIZE_TYPE_DTYPE class Merge: @@ -243,7 +243,7 @@ def _gather_maps(self, left_cols, right_cols): # tables, we gather from iota on both right and left, and then # sort the gather maps with those two columns as key. key_order = [ - cudf.core.column.as_column(range(n), dtype=size_type_dtype).take( + cudf.core.column.as_column(range(n), dtype=SIZE_TYPE_DTYPE).take( map_, nullify=null, check_bounds=False ) for map_, n, null in zip(maps, lengths, nullify) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e7efd01ca85..64ec099cb39 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -17,7 +17,6 @@ import cudf import cudf._lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column @@ -34,7 +33,7 @@ ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import is_column_like +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_column_like from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -199,7 +198,7 @@ def __init__( ) if lo == -1: # Now we can gather and insert null automatically - code[code == -1] = np.iinfo(size_type_dtype).min + code[code == -1] = np.iinfo(SIZE_TYPE_DTYPE).min result_col = level._column.take(code, nullify=True) source_data[i] = result_col._with_type_metadata(level.dtype) @@ -1578,11 +1577,11 @@ def droplevel(self, level=-1) -> Self | cudf.Index: def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.MultiIndex: - # cudf uses np.iinfo(size_type_dtype).min as missing code + # cudf uses np.iinfo(SIZE_TYPE_DTYPE).min as missing code # pandas uses -1 as missing code pd_codes = ( code.find_and_replace( - column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(np.iinfo(SIZE_TYPE_DTYPE).min, length=1), column.as_column(-1, length=1), ) for code in self._codes @@ -1903,7 +1902,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = column.as_column( -1, length=len(target), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): return _return_get_indexer_result(result.values) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 0abd42d4d4e..eedd777aafe 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations import itertools @@ -12,13 +12,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor -from cudf.utils.dtypes import min_unsigned_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, min_unsigned_type if TYPE_CHECKING: from cudf._typing import Dtype @@ -1333,10 +1332,10 @@ def _one_hot_encode_column( else: column = column._get_decategorized_column() # type: ignore[attr-defined] - if column.size * categories.size >= np.iinfo(size_type_dtype).max: + if column.size * categories.size >= np.iinfo(SIZE_TYPE_DTYPE).max: raise ValueError( "Size limitation exceeded: column.size * category.size < " - f"np.iinfo({size_type_dtype}).max. Consider reducing " + f"np.iinfo({SIZE_TYPE_DTYPE}).max. Consider reducing " "size of category" ) result_labels = ( diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 6d617cbf38e..7e8468c8e8a 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations import errno @@ -16,11 +16,13 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_hashable, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) from cudf.utils.performance_tracking import _performance_tracking _CSV_HEX_TYPE_MAP = { diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ff326e09315..16c7d189dfd 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import os @@ -14,10 +14,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from cudf.core.column import ColumnBase diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index f3124552fd1..0ac2950a22b 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
from __future__ import annotations import itertools @@ -11,11 +11,11 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock from cudf.core.index import _index_from_data from cudf.utils import ioutils +from cudf.utils.dtypes import dtype_to_pylibcudf_type try: import ujson as json # type: ignore[import-untyped] diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 31a8f4de3b3..9e932acb5fa 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime @@ -11,6 +11,8 @@ import pyarrow as pa from pandas.core.dtypes.common import infer_dtype_from_object +import pylibcudf as plc + import cudf if TYPE_CHECKING: @@ -151,7 +153,7 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif cudf.api.types.is_decimal128_dtype(dtype): return cudf.core.dtypes.Decimal128Dtype - elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: + elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: return dtype.type return infer_dtype_from_object(dtype) @@ -604,6 +606,66 @@ def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: return dtype.base +def dtype_to_pylibcudf_type(dtype) -> plc.DataType: + if isinstance(dtype, cudf.ListDtype): + return plc.DataType(plc.TypeId.LIST) + elif isinstance(dtype, cudf.StructDtype): + return plc.DataType(plc.TypeId.STRUCT) + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = plc.TypeId.DECIMAL128 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = plc.TypeId.DECIMAL64 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = plc.TypeId.DECIMAL32 + return plc.DataType(tid, -dtype.scale) + # libcudf types don't support timezones so convert to the base type + elif isinstance(dtype, pd.DatetimeTZDtype): + dtype = _get_base_dtype(dtype) + else: + dtype = np.dtype(dtype) + return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) + + +SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { + np.dtype("int8"): plc.types.TypeId.INT8, + np.dtype("int16"): plc.types.TypeId.INT16, + np.dtype("int32"): plc.types.TypeId.INT32, + np.dtype("int64"): plc.types.TypeId.INT64, + np.dtype("uint8"): plc.types.TypeId.UINT8, + np.dtype("uint16"): plc.types.TypeId.UINT16, + np.dtype("uint32"): plc.types.TypeId.UINT32, + np.dtype("uint64"): plc.types.TypeId.UINT64, + np.dtype("float32"): plc.types.TypeId.FLOAT32, + np.dtype("float64"): plc.types.TypeId.FLOAT64, + np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, + np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, + np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, + np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, + np.dtype("object"): plc.types.TypeId.STRING, + np.dtype("bool"): plc.types.TypeId.BOOL8, + np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, + np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, +} +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { + plc_type: np_type + for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() +} +# There's no 
equivalent to EMPTY in cudf. We translate EMPTY +# columns from libcudf to ``int8`` columns of all nulls in Python. +# ``int8`` is chosen because it uses the least amount of memory. +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype( + "object" +) +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") + + +SIZE_TYPE_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] + # Type dispatch loops similar to what are found in `np.add.types` # In NumPy, whether or not an op can be performed between two # operands is determined by checking to see if NumPy has a c/c++ From 559cda24e4258da1aa35b7de60f46e8a86b1effa Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:18:27 -0800 Subject: [PATCH 03/12] Use 64-bit offsets only if the current strings column output chunk size exceeds threshold (#17693) This PR improves on #17207 and only uses 64-bit offsets if the current output chunk of a strings column exceeds the large-strings threshold instead of using cumulative strings column sizes per `pass` or `row group` level. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17693 --- cpp/src/io/parquet/reader_impl.cpp | 48 +++++++-------------- cpp/src/io/parquet/reader_impl_chunking.hpp | 5 +-- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c48ff896e33..f9fcca6bb4f 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num _stream); } - // Compute column string sizes (using page string offsets) for this subpass + // Compute column string sizes (using page string offsets) for this output table chunk col_string_sizes = calculate_page_string_offsets(); - // ensure cumulative column string sizes have been initialized - if (pass.cumulative_col_string_sizes.empty()) { - pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); - } - - // Add to the cumulative column string sizes of this pass - std::transform(pass.cumulative_col_string_sizes.begin(), - pass.cumulative_col_string_sizes.end(), - col_string_sizes.begin(), - pass.cumulative_col_string_sizes.begin(), - std::plus<>{}); - // Check for overflow in cumulative column string sizes of this pass so that the page string // offsets of overflowing (large) string columns are treated as 64-bit. 
auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), - pass.cumulative_col_string_sizes.cend(), + auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), + col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; }); if (has_large_strings and not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } - // Mark any chunks for which the cumulative column string size has exceeded the - // large strings threshold - if (has_large_strings) { - for (auto& chunk : pass.chunks) { - auto const idx = chunk.src_col_index; - if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } - } + // Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output + // column chunks and the large strings threshold. + for (auto& chunk : pass.chunks) { + auto const idx = chunk.src_col_index; + chunk.is_large_string_col = (col_string_sizes[idx] > threshold); } } @@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and col_string_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data( - col_string_sizes[pass.chunks[c].src_col_index], - pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > - static_cast(strings::detail::get_offset64_threshold()), - _stream); + out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], + pass.chunks[c].is_large_string_col, + _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -416,11 +400,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { - // need to cap off the string offsets column - auto const sz = static_cast(col_string_sizes[idx]); - if (sz <= strings::detail::get_offset64_threshold()) { + // only if it is not a large strings column + if (col_string_sizes[idx] <= + static_cast(strings::detail::get_offset64_threshold())) { out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); - final_offsets.emplace_back(sz); + final_offsets.emplace_back(static_cast(col_string_sizes[idx])); } } } diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index ca46f198bb8..4a773fbced1 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -130,9 +130,6 @@ struct pass_intermediate_data { rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; - // cumulative strings column sizes. - std::vector cumulative_col_string_sizes{}; - int level_type_size{0}; // skip_rows / num_rows for this pass. 
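
A minimal standalone sketch of the chunk-level decision this patch describes: a strings column chunk is switched to 64-bit offsets only when that chunk's own decoded size exceeds the large-strings threshold, rather than a pass-level cumulative size. The struct and function names below are illustrative placeholders only, not cudf's internal API.

#include <cstddef>
#include <vector>

// Illustrative stand-in for the reader's per-column-chunk descriptor; not cudf's type.
struct chunk_desc {
  std::size_t output_string_bytes;   // decoded string size of this output table chunk
  bool is_large_string_col = false;  // whether 64-bit offsets are needed for this chunk
};

// Mark (or unmark) each chunk based only on its own output size versus the
// large-strings threshold, instead of accumulating sizes across the whole pass.
inline void mark_large_string_chunks(std::vector<chunk_desc>& chunks, std::size_t threshold)
{
  for (auto& c : chunks) { c.is_large_string_col = (c.output_string_bytes > threshold); }
}

Keyed per chunk like this, a column that is large in aggregate but small in every individual output chunk keeps 32-bit offsets, which is the improvement over the earlier per-pass cumulative check.
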
From fb2413e1505297e737095d97e0732eec52519802 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 10 Jan 2025 10:06:35 -0800 Subject: [PATCH 04/12] Make tests build without relaxed constexpr (#17691) Contributes to https://github.com/rapidsai/cudf/issues/7795 This PR updates tests to build without depending on the relaxed constexpr build option. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17691 --- .../cudf/detail/utilities/integer_utils.hpp | 12 +++-- cpp/include/cudf/utilities/span.hpp | 40 +++++++++------ cpp/src/io/utilities/parsing_utils.cuh | 49 ++++++++++--------- cpp/src/io/utilities/trie.cuh | 8 ++- .../transform/segmented_row_bit_count_test.cu | 4 +- cpp/tests/utilities/column_utilities.cu | 18 ++++--- 6 files changed, 75 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 2e3d71815c0..44a86f1c84f 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ */ #include +#include +#include #include #include @@ -44,13 +46,17 @@ namespace util { * `modulus` is positive. The safety is in regard to rollover. */ template -constexpr S round_up_safe(S number_to_round, S modulus) +CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } auto rounded_up = number_to_round - remainder + modulus; if (rounded_up < number_to_round) { - throw std::invalid_argument("Attempt to round up beyond the type's maximum value"); +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Attempt to round up beyond the type's maximum value", cudf::data_type_error); +#else + CUDF_UNREACHABLE("Attempt to round up beyond the type's maximum value"); +#endif } return rounded_up; } diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index e7b76946248..b5044a58934 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,11 +197,16 @@ struct host_span : public cudf::detail::span_basedata() + offset, count, _is_device_accessible}; } @@ -434,8 +439,8 @@ struct device_span : public cudf::detail::span_basedata() + offset, count}; } @@ -475,28 +480,28 @@ class base_2dspan { * * @return A pointer to the first element of the span */ - [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto data() const noexcept { return _flat.data(); } /** * @brief Returns the size in the span as pair. 
* * @return pair representing rows and columns size of the span */ - [[nodiscard]] constexpr auto size() const noexcept { return _size; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return _size; } /** * @brief Returns the number of elements in the span. * * @return Number of elements in the span */ - [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto count() const noexcept { return _flat.size(); } /** * @brief Checks if the span is empty. * * @return True if the span is empty, false otherwise */ - [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_empty() const noexcept { return count() == 0; } /** * @brief Returns a reference to the row-th element of the sequence. @@ -507,7 +512,7 @@ class base_2dspan { * @param row the index of the element to access * @return A reference to the row-th element of the sequence, i.e., `data()[row]` */ - constexpr RowType operator[](size_t row) const + CUDF_HOST_DEVICE constexpr RowType operator[](size_t row) const { return _flat.subspan(row * _size.second, _size.second); } @@ -517,7 +522,10 @@ class base_2dspan { * * @return A flattened span of the 2D span */ - [[nodiscard]] constexpr RowType flat_view() const { return _flat; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr RowType flat_view() const + { + return _flat; + } /** * @brief Construct a 2D span from another 2D span of convertible type diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 75e45a68842..9833dab282e 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -171,7 +171,10 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag) } // Converts character to lowercase. -constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; } +CUDF_HOST_DEVICE constexpr char to_lower(char const c) +{ + return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; +} /** * @brief Checks if string is infinity, case insensitive with/without sign @@ -515,13 +518,13 @@ struct ConvertFunctor { template and !std::is_same_v and !cudf::is_fixed_point())> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex = false) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex = false) { auto const value = [as_hex, &opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values @@ -564,13 +567,13 @@ struct ConvertFunctor { * @brief Dispatch for boolean type types. 
*/ template )> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex) { auto const value = [&opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values @@ -593,13 +596,13 @@ struct ConvertFunctor { * is not valid. In such case, the validity mask is set to zero too. */ template )> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex) { auto const value = [&opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh index c0efc5b6f20..dbdc4a34277 100644 --- a/cpp/src/io/utilities/trie.cuh +++ b/cpp/src/io/utilities/trie.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,16 +74,14 @@ CUDF_EXPORT trie create_serialized_trie(std::vector const& keys, /* * @brief Searches for a string in a serialized trie. * - * Can be executed on host or device, as long as the data is available - * * @param trie Pointer to the array of nodes that make up the trie * @param key Pointer to the start of the string to find * @param key_len Length of the string to find * * @return Boolean value; true if string is found, false otherwise */ -CUDF_HOST_DEVICE inline bool serialized_trie_contains(device_span trie, - device_span key) +__device__ inline bool serialized_trie_contains(device_span trie, + device_span key) { if (trie.empty()) { return false; } if (key.empty()) { return trie.front().is_leaf; } diff --git a/cpp/tests/transform/segmented_row_bit_count_test.cu b/cpp/tests/transform/segmented_row_bit_count_test.cu index 652b9053582..0e4f623f0a2 100644 --- a/cpp/tests/transform/segmented_row_bit_count_test.cu +++ b/cpp/tests/transform/segmented_row_bit_count_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,7 +74,7 @@ compute_segmented_row_bit_count(cudf::table_view const& input, cudf::size_type s // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. 
auto const size_begin = d_sizes + segment_idx * segment_length; - auto const size_end = std::min(size_begin + segment_length, d_sizes + num_rows); + auto const size_end = cuda::std::min(size_begin + segment_length, d_sizes + num_rows); return thrust::reduce(thrust::seq, size_begin, size_end); })); diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index fb9bdeb0b22..6888f26fd16 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,6 +37,8 @@ #include #include +#include +#include #include #include #include @@ -412,14 +414,16 @@ class corresponding_rows_not_equivalent { T const y = rhs.element(rhs_index); // Must handle inf and nan separately - if (std::isinf(x) || std::isinf(y)) { + if (cuda::std::isinf(x) || cuda::std::isinf(y)) { return x != y; // comparison of (inf==inf) returns true - } else if (std::isnan(x) || std::isnan(y)) { - return std::isnan(x) != std::isnan(y); // comparison of (nan==nan) returns false + } else if (cuda::std::isnan(x) || cuda::std::isnan(y)) { + return cuda::std::isnan(x) != + cuda::std::isnan(y); // comparison of (nan==nan) returns false } else { - T const abs_x_minus_y = std::abs(x - y); - return abs_x_minus_y >= std::numeric_limits::min() && - abs_x_minus_y > std::numeric_limits::epsilon() * std::abs(x + y) * fp_ulps; + T const abs_x_minus_y = cuda::std::abs(x - y); + return abs_x_minus_y >= cuda::std::numeric_limits::min() && + abs_x_minus_y > + cuda::std::numeric_limits::epsilon() * cuda::std::abs(x + y) * fp_ulps; } } else { // if either is null, then the inequality was checked already From dc2a75cba40d38f4a6ba66e652764e96fa6b593d Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Jan 2025 13:22:39 -0500 Subject: [PATCH 05/12] Add special orc test data: timestamp interspersed with null values (#17713) Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17713 --- ...e.timestamp.desynced.snappy.RLEv2.hasNull.orc | Bin 0 -> 5951 bytes ...stamp.desynced.uncompressed.RLEv2.hasNull.orc | Bin 0 -> 6565 bytes python/cudf/cudf/tests/test_orc.py | 6 ++++++ 3 files changed, 6 insertions(+) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc new file mode 100644 index 0000000000000000000000000000000000000000..8772f84c3ba2a7942323f49ac28c3e5b172a91fc GIT binary patch literal 5951 zcmaKwdvH_dnZ_T?xR5lJv24X%8VbY$ylTk?@?*+;e|9od$AXd0?qoGRN57280S zG&aHsjtl{0T3Dr7sAwCiY&WdvCaltKsEEN4O+!UD5U~v)ifu$LL?qbAfQ=0HI_b{r z{ZBGzYY!Sz_4)7SG|3=|4MJiM<0FN zSjS?0h5h~`a{X7BdmWT4P*38k=9?dUW4rG+tVN&lqp#cRa1V?5YL@uTum4!Qy>{QtYjU2fOX;~l)?6%~6{T_DgDlxv67x}lc3 zAy@s-6WnDR^h1M&q1}x``^lk0+?Pj9L#LaDCYpz)%)6AIeq7qRrJUNLwrpu>+v4K= z^ofow8?0LfZCiHRx9sQNd&se6v~$bpt}PSYTc-H0z_qosXKOjLRlRy^i{L)jLtCHt 
z_SOv#Zyj8-b+_P^{qC)Y9@#qj=+@JZZJj8*Z>sktJn>TLcV8;^yrlj+p6z?QHGj`r z`xLK#Eq?WV-d->7z&hUHf8d??0si%A-j(&d*$sTb5BX(2{QEO}!?S$bkN9hzJ?*lAoP>fq!K)e|B)`(=QfX3kYOG0>c)8ZL46-OZae`pntny z`wqe0py0qT&hHeQ__5&JPXt$X31)ZW(mxiKg$iXa7aD$AXnUpbiGSL$Vb6}iy*qaQ zY{&kR6^Hii82$N<)4$j;@yi`kC1LCjmi{VO9to;n3$~PsUB3=K@h`y*uLlPY1b3H) z_eX<=4hBbm6FmKI!HIjsQ*RFA&~WL$50}S=)%Wac`R%alcf(Ko$MA-?hX+L~cmIBP ze|-4Rk>SyIhEI$3O&lGbI<^y|J4@f)S-$K6_3@o8f7t1IZ|4(#+_~XI(e^(T?M)UP zI9YV~&#cz}EIRi=(Unt0v!{y%AF}MJV%eEu!+#aq{!+Z=BUaDZ;{J2R+s_y8O&1@y z!1~T#i%(oEJ~vi;WxRNHg7w`?OUp7#WtW#4K3;13g!TPTm#+P6Y5$d_+dp5rH_Q6r z)uo5OSbE~irRT0Ky)wz#I8`FJUQ(7TkxiEvM3-K?S+XWGxpttO{;m>DQ$45T8cI-LhTNmH>atg+gv1<$c#^Ynk>oYIE z^A4-T>^Ghnd*kNJTO%XaCO2c~^0@N$%$cJjw; zTaNOk%mLG~i8r~kx_952+&x;GxV>j^OS0##Gn0?Hu^tWwW^d?28ip_4j`=@T1Ge9_*iaf6P4L+C8xK zW3y!A@)Kj$oHf*cf90|2jru>0Idj{7I`HdHnt6o}C&wPnz5h=Gzx(8va%10%k=|+k z-huZ%HETES`SaLQ6C1(fN+y zeEh-NGiQ(9y*)X$ba-K6@!Pe0biQz^cW7YM(DB0AhZFNFrpC7om@}*HnV9{j-KCr*k`>cefF5}tn;GvM@@fu#`xG%KN>mP*E@>mn$GtVn$`O~;+;NtYmhvUxYAG&8WDE9H8bQq`YQdE zfy!XP!OC!Dq%v9=tBhABDwCC|%5=e7mD$Q%Wxk4CC9D!viK`@4M8T0NvWlvrtC%Wx zm8Z&E<*V{n1*<|;;i^bgv?^8=uS!%U3r<$0t1?yDs$5mRnq4id7FCN2&QufC+G?_z zs-~-%YIn7#+FNj;+Fu>04pxV%!_|@MXmzYQUU0cOS)Hm*S7)lT)w$}tlr0q&T$PHY z5-B0oN=YdtrKOD2T`(>6N_|qlG$0L1L(;G`B8?WzN#oLlG$~C<)6$GIE6qvsoC29p zCX$I|5*Z=W%19X{qd7}tZkb2smHA|TSwI$)g=Aq_R2GxPWeHhQmXf7q8CjOIQkIvq zO{5M&VX?6kg6F3cn(t2r5E~up**}Dq@PbBB@9z(u#~C ztH>$xHS8K8XI+iBMp8r6XluwCs)nv%YTTS>YP>bR8h=foCRh`y3D-nwqMS`N@tQ#q&e25UpLVb067 z(b`yTyf#srtWDLXYcsW3&c52blC2afMM|+!q9l}BCCPbBNh=wpTj^1Hl|H3k8Bhi} z2bE!EL>X1alyPN3nN+5fX=PTKQ|47{l~5&8iB%F6!8xKLRg{WWF)Fvpqw=bJDnIAA zDyRyn!m5ZWs*0)Ns)Q=ZIjKskGODa9r^>6@YN1-B7IV(13AI*Dswp+CX4G!ANA2ZY zQ2W&Zbx<8rht&~vR2@^tIhWN*bxNI9XVh7BPMz1VHA2o+jaVbm5E`w9)KD5)!)V-` zX^mIo)A%(3O;8ikgf$UOlryJ^YZ98ICZ$PhGMcO=r^#!DT9H<)m1qgAR!eFrEzMn` zb!$CZuhysaYXjP#Hlz)6muaKgm^Q9WXp`EMHm%KQv)q;1ypF9C>O?xRPNE}pS{=!) z(9t?Z=hk_2UY$?p*9CMzu0j{qMIL`r7uCgdaa}@})TMN3u1=TL<#hQvcAcd9iMk}$QJ1dE)Me{(b@_UB zy|7+XFRmx*we@5@RZrJ5_3nC4y_fq)y}v$CAFL16hwCHt(fU|@ocm;bvOZOxuFuqG z>vQ$_26lsxyRJdpAZZ{Pv<+kf)j&5e4Q}o;4c-P{gTEor5NrrFgc~9aQSPROctfHg zxqbl+DKwKh4LLOA!5xBL2)zh;G4vAX3Fx)Zlh9Mp)8Gz6?}pw3y%%~P^nU0A z&0;O>Jy4+9$pAq*lI#4t!;AYjnKK*B)7 zz`)>!!2^RA1|JN57y>W^!955=7={Q8Q5a${#9>Ilkc1%xLmJ$-V93IdgCUPbHX4O! z6roX!MhO}TaF3voL?eYp8jTDZ-Dvcn(TheO8vWoNM`I9;AvA{37(rtcjWIOF(U?GE z65Nw$OrtS_#w;3hXv~9TgA{@kffR##1|$Je3z7s$fuun)Al)E6AidyT0O$UWHK%qXb3*MlFmaj1-JCj0}u! 
z7`-t1VD!TnfH4ST2*xms5g4Q3&cPUmF#%%|#uSWc7&9&!d^0a~jYrLbDjnlH8vcy+$;X)ACs~ z(`aUGX*mZGXt}G<|nf-d%=5T=8Hu>0CRAbUgX0tM_`W5 zzQ3sB;5~V}$^df;=Jf5rEX-M$b1>)8!bXb_ymk4<4x>ea79xM7A1x$WsJRsev@mFK zgZB(tylC;E#XmPaizb7;xm{cIMk zLbQs|Dn_dWtpr-N^NGV~rO-+*l=q|6jaJXX3kJ0M(CSBP0Ifl^hR_;DYZR?9w8qhz zKx-1MDYT~1nnCO0&F({M9uymt5R?d%7?cDQ0ZI#s1n)IaG$;m?87nBc_A5;KT z5WIt+!k{9cqM%}+;-C_slAuzc(%`)XDhnzHDh~@A79lJmSj4bMU?IRe0t*QX1q%%e z1B)9L4=i3-e6aYzI}S?_mJlpqSR$}QVTr*Kha~|^618G9NO||XQN#R-c_`V(Jnzdfp#t0NwiaFr_s)!-3{I}+P!G^q1})60NR6S z51~Db_6XXe;LV{uj`jrFlW0$&J&pDZ+OufSp*;_N0Xl@}5TQei4hcF4bZF5*qJu&Q z4gL~zxY6N3hZh|_bokK`Kt~WAA#{YnUxtn-I%4REqa%TiBsx;)NTVZzjx6{qdj~u6 zR<>1W64;tv;*Y8n6cW3TxOJu|};iYuuW!Cao!J+L~Sb z)6Co0Hla;q6Wb& zPubJ8Iw3a4mil%9XcJrU1y|j<^(*Zh2hv+aJp`-jwbevAm zNjgQR=?tBvb9CMzbch^chr~fRv<}iiIcWZ{!|m`mybhnk?+7@8j*uhFf7uar#2j%) z!jW{O9BD_!k>&4m5O*9I^&&*&SYn*GtGaiGuxT#%y+T7gk7R8ahIfv;2-HCyQnU@i|KNAdAht^zAiui zcvrA1)D`ZEbVa*jUGc6&SCW6SE8Ugp%68?t^4;uiVYjGT+)Z?AyUA{<)E@yCdDv?pSx6f4Mu^o$5|^XS%c9x$eA^?G*B_I>k;FIcHu_;1aq-F0o7EB3xP* z>7rb;V2R7^^0>S%pUdwGxPq>bD=b*%in?O1xGUjGx>ByRE91%vR=V;%>>go{s7Kr* z=^=WwJ!B8v!}Pd&JU!kXUyr{h&=V9Wdcr-Co@h_3C*G6jN%o|A(gIyiwkOwb literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc new file mode 100644 index 0000000000000000000000000000000000000000..f5a1edbb10ee55f131764e18b638c35c8e06b0cf GIT binary patch literal 6565 zcmcI}eRLD|neHQ5Uls;CA_*YU;L0D8Mz%giwk3~b%O2U5JhCl$XcDS4b*gMPRM`fq zq_GiBaAXJ|vxQZfg^KQmD%%^b=uKFq-B1yOBbtVaZXlu!Ac}27K8Q%LkpUYS>~-7S zy}fsvJ@?#m`uk(v=ltI1eV_N8-^`pd=XJ7Hu~5Pn4HUWX2k@b20m#Kd{BcR{6HP8+ z?pb~+cx!*yP#Qb%(`eYB_*AGpce>QY5v#=FMjqz5Ur8<5Q6Tw}=$weTcX{CO?@6WC z-x)rzzT(}nYYd0Q|)KY48aZ$JO&?E>xtU$DOU1^W-Y^aYLw zzhM7?uYQr^_Y$sS^MfCKQ5`RRLFE6R)SpfE9}=!(dvE>j&(!f3kotWc3vK`Wga1qJ zFC_J+ZT}aEg*sk;?@#LZZ_nyaGyNxi=|9o`3I2=Jf4Kjwge&orAOG^->-c|s^1iZr zzAz^SL>w&VLQw!C4;(($2(VU&J`uJRqEL+ei*Tq2#~0)565LgcnG%lZF3xfZN3j$u z?&jF<;jForvvwJ$Z#llag0n};*}sx==swQzFF|=f=khAfOfDn(GIzNQx-zb@oNKS( zuBqg%m7`w4-KOO3A-MZh+(T7puI8Rqb1!SSGui@?uAr10)anPDY6e}kgOBo;t~U%0 z7zcON5ALG|5At6eF%O<<7#wdLoV4szfBa!d^X5`|v(~z~sb#ZE@cl>IHm|pB9md_KeQWWfZwdBz z1^b^69Qqr<@o(dQJ}J1oPB61xDEf|Yxex#Rl+gIJ(EeTFnrDP-d-3x=;kFIJJ$~W- zjlx6h_}kPcXP*;Z-XxqESn}laMOOnN#h}QzS!CZLTJr+l-zw_cCfc@Lv?nOqKZJ8T zM92SDboP6q%R5CgyKwRE3YUip6)zSVzh7v7sqoRi-@bnL_JKXycl}`dzTy=J_ii8g z;r3Jiuzma=w@(&_u`gKiqhM(ysC^~aR3df#IQZ!Q2(EuMIIusst0cTH8a#L)IP#O= zss9xmze_s#`VbBdmHhKiX>3S)*WRX|4!M3d^yt3~t$%Z9K(cbz&xiKKhYlVd8hLBz zlw|Mtk)g?>J20}NyH<0`&H4NWYPW;MTdUPY5v!u zv+osMK3Oz#YO&~jPJ3#x;`CzUZx-8syLin9obEG=`_3-jc5d;W^y2;JIbZ$V;^P+< zpB-I%d2I2_IOiJ|mn_dLQCwPL{BVi=BhI%zUb6O+C4HBdZ2NS{o-F4(SC$<5Y{~J@ zmz=%2 zzM|x{`2k>{jMt+zPE7QgBA z=xaBo-xwagIHX$hE@ zj=#>I(ZBr`zhu*9bJ;~r;mqp9k$WePayIKPzP@yG|!Ub0C0&O9$OTzmI>{olQG^!~o-cSkMbu3i0GKD5X-EIU4G z%h^JG_m&^6++g_Cs57_q`~5%us8LY(P;&I4+`E6@|Fe&dsyFmLAL*GA?&*K$V+*-q z_pe7EA73Bt|Lw;|>o&akucPaxw*Ij1ez}F(aOS1>^@*7Xj?YmLD zf7teRGn?Q0o4NfroYCQjvL_#${K^A!2XE~B$?)S>obLSj4{qOk&&k(@d#{}QdVcoI z+{n#h{ySSgb9!c1o}D{>(-|Ay{r2;JJ!?8Qck1TOpANtJxzjuOSMIs9H&6b{@SC5X zd~$Z<`P-L2ifp8M8R&))vA zPgd1mT>jqNhN-ca`{ysOYR)e^HDj;odF9zvpRQ`pKa`sL!S%6M?^v@IHvjT(8xP!A zcVKeQ6Bc)V>`dd~8xybJ8T`yFU%BdBGqiXR?msK$H&+G!|k~h z#kPxv({rzKpPL^kdhWAjM++9-vh_XMrSBKq;5@slr?2=TcV=YH)w6%uMEEgXL@@^@Jz$G-p4Pz`poe8-gOt=?KNX-?}bl# zhsSz1jSBl3F7|F6Dd`)&)Vp)kwxQvp-rb{*Z+MHd_2tp6{)SI`UmbnbKYXS4^^wwz z4WIYEIr_=Q;fda(W5WK1Ytz3P)AV1Mntp%G`RpS%CW}_Bd-lTg^u@8A&pmQ$`qQ!3 
zadCF~+StiWkKCEQH8!#7!u(`OY4O0U|GG*C}D&qy0DwCC|%5-I>GFzFe%*(lQalsY2R4$Vfa#Bvo zX*nZj(bU{)TNC*(NkvMLR%8@e-bzJY$yJJ#5~Wlr zQxZy2N%6{*jFMHll^&&6=~McZ0cDV8pJye4M2H9z5h6;&h&YiTk~{~ICNe~p$PsxJS0z?S zR8rn*6`>+kl!{g{DpuuIc~oBB!z#ZjpbDx&s<0}eimGC&xGJeisnV*9Dyzz=@>Sd_ zG4F{gX_c&ss3NPVD!PiPVyoP|r>eYFzAArJpek4ustQ*{s-nD&Rq?7sRkA8om9ENE zWvg;kdEQ{PxLQ&zt(H|2)nqkQO;M-xc>S%SWI$oWq zPFAO?)76>kEN^dhUd>gD)e^N-EmIR}Qcdw*Q8Q{*?N)o#UbRo{R|nKV-T`%39Z^Ts zF?C#>P$$(Xby}TO=hS%(S0mO)G*XRBL+}o3C=IP)G_1z0@o2mnpT^HSrU`07ny@CK ziE3h+xF(@V@=j>dnv5o^$!YRhu2!s-Xr;W^b1bga(Jo6>o8KAm3| z&;@lNU04^QcJ2E~Crpa=JVzCMBell#v8Uk`zgk41W>nCOxE=^pSot zKnBSW8Rjn~qhySXlL<0OrpPpzA+!9IWM0qJi}ez{R4>yLdQwmE%k+$%)w}f`y;twk z`}F~Rkgw8*^^vbXrjP1l`nW!!PwG?pG+(dJ>T~*h4Yx*IBdL+r$Z80_sfMbdYnU3g z#$Ds7@z(fi{CsOouqIR!u8Gt{YhpF=nnX>K@2E-FWNNZCxte?}w^m#$sg>3ewPY<- zOV={BY^}T2Q|skFTX^5U6jAEE?$?YORk$oT?%z+)MZeY zMO_YcdGH5e5W^sWK?;Km1_A~W1_}lm1_t~g7~C*;VDQ4=gTW6&0EQq8AsE8ozX(GV zh8PTS7!oifVMxJ{h9Lt(7W}<1KsF$Ih0RJ%R zDb&-bXHd_g-i>+>>b!Fp)4(FwrnEFtISXVe-P{gUJt50Hz>J zA(+B2MPQ18KMPYFrUXn$m{KsMVamXig((M99)bdx#V|`?mclH9nShyunSz;ynSo#t z%x;)HFneM4!R&`Q0CN!L5X@l+mckr`IRm_Y-92694n2!b*+Flb=W;6{T74c>`6eQ5BbAu#!{5dsw&!jp#<_)#>(&=5yM z0u4ztq^`ZV5UWQ+_S&@tejbh7oYRO#2^ytnl;wWC5NkpsHKm+EBZEeEYU?31deG=a zqYsUKGzPDG`p_6gW90hCLSPJy@f%-SASWSkpfQcc3>vd&%-xtd1Pd1y@y!SOV39(w z`sPoKu#m7&u+XqDu&}VWr@uD?ix+~2r$1ZZ2Ve=#Fbn)JED>0uGw&|oI0TQ~sxZQm zf+c+`Fat{#mK-d3G;z@+hTw_(BZtr=Llcod+=nI#P4w&vBbrz=xgmH8Og$JL9-OiGBgut zCg&1|&`hJ5nJ?`_vm4Ex`R9yi_MzF2<^Y<5Xbzz{jOHksV`z?}If3RRnp0>_qd9}- zg_GTj<~(RFXfbFBXenqJXaY0|nu6dJ&UVsv=_7wv>$W;bP$3Apu?aeprfE; zpyQwupp&3epwke%0Xhph2RaWc7gjN>5?H0M%3viRI1DQVD-A0HD+{X|Ru8ORSbebi zAvgwW5Y`Z^VOS%uMq!P?8izFjYZ8JJu%=|JqcwuoCbVyxosYs*XFbNZ2?v-^csd(a-ThwTx2)E=|P?FoBQ=&+~l8GF{Av*+8n?c#Px zyHvQkooFZ9sdl=ZX=mHr?Vfh8@ZolUd!Rko9%>J_N7|$9vG%y|vG!zpsy*GFY0tLj z+Vc#T5euJSq>PLq7?PnFnqe50aSNYfyo`_WGXW;ZgqScBVWPr~Oq@wDNhZalnGBO< za!lSKc1Rpjhs;4ZNC)Mh9gJ|u;dXc&UWd=&cLW?kN5~NtzUYWLVve{Y;Yd1Cj-)6_(})U!FISiJRRN+Ux&XV&=C|K=m>X2I-(u1j(A6+ zBiWJaNDJTS$ads9@}1mHai^qH+9~TKgoit+PP&unWINrRo=$J4uhTC))*0*!b%r}5 zozc!%XS_4fnG~MrOm}8Fvz@uld>6M%+$HIfb`f1<7u7{~FC=dIcX>3WSwr|l+)|< zIsMLnGw2LC!_J5^Dx7u3oe5{snR2F`8E4j+bLK?_F0o7ElDcFr!bQ3$7wuw1i(GD( z$K`eTTz*%;6?BDMVbM}o)D?5ZT?tpxm2#zB8CO=c(v|P#c8j|u-O_GZH_=UYQ{7BA z+wJc5bbGsf-Tv-CcTlA24tGboqusIYcz2>Z*`4Z6i}c;u?p$}C<+5T{!b({gONdM? z#nLRpvaFl+uwK^3`bAbY$cET38)2htjE%DiHYswjX*R=V*&Lf+&0Q^CEmvNHvBkoO;}25{Q9;SHPFl2J)-L~zw$!Bf@NU@(QIUkhkrsV>L(kLS`t~=U zdiopRUSA3Yw2+BJg?~ia5AUY_n4kJ@^EV9?**P9Qsn)9XqQ6<_Wnh3O Date: Mon, 13 Jan 2025 08:47:23 -0500 Subject: [PATCH 06/12] Add seed parameter to hash_character_ngrams (#17643) Adds a seed parameter to the `nvtext::hash_character_ngrams` API. Makes this more useful in conjunction with other nvtext APIs. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17643 --- cpp/include/nvtext/detail/generate_ngrams.hpp | 39 ------------------- cpp/include/nvtext/generate_ngrams.hpp | 4 +- cpp/src/text/generate_ngrams.cu | 13 ++++--- cpp/tests/streams/text/ngrams_test.cpp | 4 +- cpp/tests/text/ngrams_tests.cpp | 13 ++++++- 5 files changed, 25 insertions(+), 48 deletions(-) delete mode 100644 cpp/include/nvtext/detail/generate_ngrams.hpp diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp deleted file mode 100644 index ae48fed4e79..00000000000 --- a/cpp/include/nvtext/detail/generate_ngrams.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -namespace CUDF_EXPORT nvtext { -namespace detail { - -/** - * @copydoc hash_character_ngrams(cudf::strings_column_view const&, - * cudf::size_type, rmm::device_async_resource_ref) - * - * @param stream CUDA stream used for allocating/copying device memory and launching kernels - */ -std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, - cudf::size_type ngrams, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace detail -} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 54282b8ef3c..b2ba1798a8f 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -117,6 +117,7 @@ std::unique_ptr generate_character_ngrams( * * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. Default is 5. + * @param seed The seed value to use with the hash algorithm. Default is 0. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return A lists column of hash values @@ -124,6 +125,7 @@ std::unique_ptr generate_character_ngrams( std::unique_ptr hash_character_ngrams( cudf::strings_column_view const& input, cudf::size_type ngrams = 5, + uint32_t seed = 0, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 997b0278fe2..33d52ccd570 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include @@ -315,6 +315,7 @@ namespace { */ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + uint32_t seed, cudf::size_type const* d_ngram_offsets, cudf::hash_value_type* d_results) { @@ -332,7 +333,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st __shared__ cudf::hash_value_type hvs[block_size]; // temp store for hash values auto const ngram_offset = d_ngram_offsets[str_idx]; - auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{seed}; auto const end = d_str.data() + d_str.size_bytes(); auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1; @@ -368,6 +369,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& input, cudf::size_type ngrams, + uint32_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -400,7 +402,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co auto d_hashes = hashes->mutable_view().data(); character_ngram_hash_kernel<<>>( - *d_strings, ngrams, d_offsets, d_hashes); + *d_strings, ngrams, seed, d_offsets, d_hashes); return make_lists_column( input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr); @@ -419,11 +421,12 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + uint32_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::hash_character_ngrams(strings, ngrams, stream, mr); + return detail::hash_character_ngrams(strings, ngrams, seed, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp index 221c0a62f3e..47b9ac46d12 100644 --- a/cpp/tests/streams/text/ngrams_test.cpp +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ TEST_F(TextNGramsTest, HashCharacterNgrams) auto input = cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); nvtext::hash_character_ngrams( - cudf::strings_column_view(input), 5, cudf::test::get_default_stream()); + cudf::strings_column_view(input), 5, 5, cudf::test::get_default_stream()); } TEST_F(TextNGramsTest, NgramsTokenize) diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index c72c7cfc80e..1a737231389 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -159,6 +159,17 @@ TEST_F(TextGenerateNgramsTest, NgramsHash) 2319357747u}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::hash_character_ngrams(view, 10, 10); + // clang-format off + LCW expected2({LCW{2818025299u, 4026424618u, 578054337u, 2107870805u, 3942221995u, + 2802685757u, 2686450821u, 584898501u, 2206824201u, 487979059u}, + LCW{1154048732u, 3209682333u, 3246563372u, 3789750511u, 1287153502u, + 3759561568u, 1092423314u, 339538635u, 4265577390u, 879551618u, + 4222824617u, 1774528854u, 1028254379u, 485918316u, 879142987u, 3619248543u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } TEST_F(TextGenerateNgramsTest, NgramsHashErrors) From 4ec389b4c515e4b3b85d6fd28b1471b1f1de830d Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 13 Jan 2025 09:27:13 -0800 Subject: [PATCH 07/12] Implement `HOST_UDF` aggregation for reduction and segmented reduction (#17645) Following https://github.com/rapidsai/cudf/pull/17592, this enables `HOST_UDF` aggregation in reduction and segmented reduction, allowing to execute a host-side user-defined function (UDF) through libcudf aggregation framework. Closes https://github.com/rapidsai/cudf/issues/16633. Authors: - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Yunsong Wang (https://github.com/PointKernel) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/17645 --- cpp/include/cudf/aggregation.hpp | 4 +- cpp/include/cudf/aggregation/host_udf.hpp | 478 +++++++++++------- .../cudf/detail/aggregation/aggregation.hpp | 6 +- cpp/src/groupby/groupby.cu | 9 +- cpp/src/groupby/sort/aggregate.cpp | 81 ++- cpp/src/groupby/sort/host_udf_aggregation.cpp | 48 +- cpp/src/reductions/reductions.cpp | 16 +- cpp/src/reductions/segmented/reductions.cpp | 17 +- cpp/tests/CMakeLists.txt | 3 +- cpp/tests/groupby/host_udf_example_tests.cu | 75 +-- cpp/tests/groupby/host_udf_tests.cpp | 245 ++++----- .../reductions/host_udf_example_tests.cu | 422 ++++++++++++++++ .../main/java/ai/rapids/cudf/Aggregation.java | 2 +- .../ai/rapids/cudf/GroupByAggregation.java | 2 +- .../ai/rapids/cudf/ReductionAggregation.java | 17 +- .../cudf/SegmentedReductionAggregation.java | 11 +- 16 files changed, 941 insertions(+), 495 deletions(-) create mode 100644 cpp/tests/reductions/host_udf_example_tests.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index a1b7db5e08a..2b2a660bed7 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -601,7 +601,7 @@ std::unique_ptr make_udf_aggregation(udf_type type, data_type output_type); // Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation. -struct host_udf_base; +class host_udf_base; /** * @brief Factory to create a HOST_UDF aggregation. 
diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp index bbce76dc5f3..451d75137e4 100644 --- a/cpp/include/cudf/aggregation/host_udf.hpp +++ b/cpp/include/cudf/aggregation/host_udf.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,18 +17,16 @@ #pragma once #include +#include #include #include #include -#include #include #include +#include #include -#include -#include -#include /** * @file host_udf.hpp @@ -43,49 +41,141 @@ namespace CUDF_EXPORT cudf { */ /** - * @brief The interface for host-based UDF implementation. + * @brief The fundamental interface for host-based UDF implementation. * - * An implementation of host-based UDF needs to be derived from this base class, defining - * its own version of the required functions. In particular: - * - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`, - * and `clone` functions. - * - If necessary, the derived class can also override `do_hash` to compute hashing for its - * instance, and `get_required_data` to selectively access to the input data as well as - * intermediate data provided by libcudf. + * This class declares the functions `do_hash`, `is_equal`, and `clone` that must be defined in + * the users' UDF implementation. These functions are required for libcudf aggregation framework + * to perform its operations. + */ +class host_udf_base { + // Declare constructor private to prevent the users from deriving from this class. + private: + host_udf_base() = default; ///< Default constructor + + // Only allow deriving from the structs below. + friend struct reduce_host_udf; + friend struct segmented_reduce_host_udf; + friend struct groupby_host_udf; + + public: + virtual ~host_udf_base() = default; ///< Default destructor + + /** + * @brief Computes hash value of the instance. + * + * Overriding this function is optional when the derived class has data members such that + * each instance needs to be differentiated from each other. + * + * @return The hash value of the instance + */ + [[nodiscard]] virtual std::size_t do_hash() const + { + return std::hash{}(static_cast(aggregation::Kind::HOST_UDF)); + } + + /** + * @brief Compares two instances of the derived class for equality. + * @param other The other instance to compare with + * @return True if the two instances are equal + */ + [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + + /** + * @brief Clones the instance. + * + * The instances of the derived class should be lightweight for efficient cloning. + * + * @return A new instance cloned from this one + */ + [[nodiscard]] virtual std::unique_ptr clone() const = 0; +}; + +/** + * @brief The interface for host-based UDF implementation for reduction contexts. + * + * An implementation of host-based UDF for reduction needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such derived classes must also define the `operator()` function to perform reduction + * operations. 
* - * Example of such implementation: + * Example: * @code{.cpp} - * struct my_udf_aggregation : cudf::host_udf_base { + * struct my_udf_aggregation : cudf::reduce_host_udf { * my_udf_aggregation() = default; * - * // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`, - * // and the result from groupby `MAX` aggregation. - * [[nodiscard]] data_attribute_set_t get_required_data() const override + * [[nodiscard]] std::unique_ptr operator()( + * column_view const& input, + * data_type output_dtype, + * std::optional> init, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override * { - * return {groupby_data_attribute::GROUPED_VALUES, - * groupby_data_attribute::GROUP_OFFSETS, - * cudf::make_max_aggregation()}; + * // Perform reduction computation using the input data and return the reduction result. + * // This is where the actual reduction logic is implemented. * } * - * [[nodiscard]] output_t get_empty_output( - * [[maybe_unused]] std::optional output_dtype, - * [[maybe_unused]] rmm::cuda_stream_view stream, - * [[maybe_unused]] rmm::device_async_resource_ref mr) const override + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override * { - * // This UDF aggregation always returns a column of type INT32. - * return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. + * return dynamic_cast(&other) != nullptr; * } * - * [[nodiscard]] output_t operator()(input_map_t const& input, - * rmm::cuda_stream_view stream, - * rmm::device_async_resource_ref mr) const override + * [[nodiscard]] std::unique_ptr clone() const override * { - * // Perform UDF computation using the input data and return the result. + * return std::make_unique(); + * } + * }; + * @endcode + */ +struct reduce_host_udf : host_udf_base { + /** + * @brief Perform reduction operations. + * + * @param input The input column for reduction + * @param output_dtype The data type for the final output scalar + * @param init The initial value of the reduction + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation + */ + [[nodiscard]] virtual std::unique_ptr operator()( + column_view const& input, + data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; +}; + +/** + * @brief The interface for host-based UDF implementation for segmented reduction context. + * + * An implementation of host-based UDF for segmented reduction needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such derived class must also define the `operator()` function to perform segmented reduction. + * + * Example: + * @code{.cpp} + * struct my_udf_aggregation : cudf::segmented_reduce_host_udf { + * my_udf_aggregation() = default; + * + * [[nodiscard]] std::unique_ptr operator()( + * column_view const& input, + * device_span offsets, + * data_type output_dtype, + * null_policy null_handling, + * std::optional> init, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform computation using the input data and return the result. + * // This is where the actual segmented reduction logic is implemented. 
* } * * [[nodiscard]] bool is_equal(host_udf_base const& other) const override * { * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. * return dynamic_cast(&other) != nullptr; * } * @@ -96,198 +186,232 @@ namespace CUDF_EXPORT cudf { * }; * @endcode */ -struct host_udf_base { - host_udf_base() = default; - virtual ~host_udf_base() = default; - +struct segmented_reduce_host_udf : host_udf_base { /** - * @brief Define the possible data needed for groupby aggregations. + * @brief Perform segmented reduction operations. * - * Note that only sort-based groupby aggregations are supported. + * @param input The input column for reduction + * @param offsets A list of offsets defining the segments for reduction + * @param output_dtype The data type for the final output column + * @param null_handling If `INCLUDE` then the reduction result is valid only if all elements in + * the segment are valid, and if `EXCLUDE` then the reduction result is valid if any + * element in the segment is valid + * @param init The initial value of the reduction + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation */ - enum class groupby_data_attribute : int32_t { - INPUT_VALUES, ///< The input values column. - GROUPED_VALUES, ///< The input values grouped according to the input `keys` for which the - ///< values within each group maintain their original order. - SORTED_GROUPED_VALUES, ///< The input values grouped according to the input `keys` and - ///< sorted within each group. - NUM_GROUPS, ///< The number of groups (i.e., number of distinct keys). - GROUP_OFFSETS, ///< The offsets separating groups. - GROUP_LABELS ///< Group labels (which is also the same as group indices). - }; + [[nodiscard]] virtual std::unique_ptr operator()( + column_view const& input, + device_span offsets, + data_type output_dtype, + null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; +}; +// Forward declaration. +namespace groupby ::detail { +struct aggregate_result_functor; +} + +/** + * @brief The interface for host-based UDF implementation for groupby aggregation context. + * + * An implementation of host-based UDF for groupby needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such a derived class must also define the functions `get_empty_output()` to return result when + * the input is empty, and ``operator()`` to perform its groupby operations. + * + * During execution, the derived class can access internal data provided by the libcudf groupby + * framework through a set of ``get*`` accessors, as well as calling other built-in groupby + * aggregations through the ``compute_aggregation`` function. + * + * @note The derived class can only perform sort-based groupby aggregations. Hash-based groupby + * aggregations require more complex data structure and is not yet supported. + * + * Example: + * @code{.cpp} + * struct my_udf_aggregation : cudf::groupby_host_udf { + * my_udf_aggregation() = default; + * + * [[nodiscard]] std::unique_ptr get_empty_output( + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Return a column corresponding to the result when the input values column is empty. 
+ * } + * + * [[nodiscard]] std::unique_ptr operator()( + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform UDF computation using the input data and return the result. + * } + * + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override + * { + * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. + * return dynamic_cast(&other) != nullptr; + * } + * + * [[nodiscard]] std::unique_ptr clone() const override + * { + * return std::make_unique(); + * } + * }; + * @endcode + */ +struct groupby_host_udf : host_udf_base { /** - * @brief Describe possible data that may be needed in the derived class for its operations. + * @brief Get the output when the input values column is empty. * - * Such data can be either intermediate data such as sorted values or group labels etc, or the - * results of other aggregations. + * This is called in libcudf when the input values column is empty. In such situations libcudf + * tries to generate the output directly without unnecessarily evaluating the intermediate data. * - * Each derived host-based UDF class may need a different set of data. It is inefficient to - * evaluate and pass down all these possible data at once from libcudf. A solution for that is, - * the derived class can define a subset of data that it needs and libcudf will evaluate - * and pass down only data requested from that set. + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation when the input values column is empty */ - struct data_attribute { - /** - * @brief Hold all possible data types for the input of the aggregation in the derived class. - */ - using value_type = std::variant>; - value_type value; ///< The actual data attribute, wrapped by this struct - ///< as a wrapper is needed to define `hash` and `equal_to` functors. - - data_attribute() = default; ///< Default constructor - data_attribute(data_attribute&&) = default; ///< Move constructor - - /** - * @brief Construct a new data attribute from an aggregation attribute. - * @param value_ An aggregation attribute - */ - template )> - data_attribute(T value_) : value{value_} - { - } - - /** - * @brief Construct a new data attribute from another aggregation request. - * @param value_ An aggregation request - */ - template || - std::is_same_v)> - data_attribute(std::unique_ptr value_) : value{std::move(value_)} - { - CUDF_EXPECTS(std::get>(value) != nullptr, - "Invalid aggregation request."); - if constexpr (std::is_same_v) { - CUDF_EXPECTS( - dynamic_cast(std::get>(value).get()) != nullptr, - "Requesting results from other aggregations is only supported in groupby " - "aggregations."); - } - } - - /** - * @brief Copy constructor. - * @param other The other data attribute to copy from - */ - data_attribute(data_attribute const& other); - - /** - * @brief Hash functor for `data_attribute`. - */ - struct hash { - /** - * @brief Compute the hash value of a data attribute. - * @param attr The data attribute to hash - * @return The hash value of the data attribute - */ - std::size_t operator()(data_attribute const& attr) const; - }; // struct hash - - /** - * @brief Equality comparison functor for `data_attribute`. - */ - struct equal_to { - /** - * @brief Check if two data attributes are equal. 
- * @param lhs The left-hand side data attribute - * @param rhs The right-hand side data attribute - * @return True if the two data attributes are equal - */ - bool operator()(data_attribute const& lhs, data_attribute const& rhs) const; - }; // struct equal_to - }; // struct data_attribute + [[nodiscard]] virtual std::unique_ptr get_empty_output( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0; /** - * @brief Set of attributes for the input data that is needed for computing the aggregation. + * @brief Perform the main groupby computation for the host-based UDF. + * + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation */ - using data_attribute_set_t = - std::unordered_set; + [[nodiscard]] virtual std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0; + + private: + // Allow the struct `aggregate_result_functor` to set its private callback variables. + friend struct groupby::detail::aggregate_result_functor; /** - * @brief Return a set of attributes for the data that is needed for computing the aggregation. - * - * The derived class should return the attributes corresponding to only the data that it needs to - * avoid unnecessary computation performed in libcudf. If this function is not overridden, an - * empty set is returned. That means all the data attributes (except results from other - * aggregations in groupby) will be needed. - * - * @return A set of `data_attribute` + * @brief Callback to access the input values column. + */ + std::function callback_input_values; + + /** + * @brief Callback to access the input values grouped according to the input keys for which the + * values within each group maintain their original order. + */ + std::function callback_grouped_values; + + /** + * @brief Callback to access the input values grouped according to the input keys and sorted + * within each group. + */ + std::function callback_sorted_grouped_values; + + /** + * @brief Callback to access the number of groups (i.e., number of distinct keys). */ - [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; } + std::function callback_num_groups; /** - * @brief Hold all possible types of the data that is passed to the derived class for executing - * the aggregation. + * @brief Callback to access the offsets separating groups. */ - using input_data_t = std::variant>; + std::function(void)> callback_group_offsets; /** - * @brief Input to the aggregation, mapping from each data attribute to its actual data. + * @brief Callback to access the group labels (which is also the same as group indices). */ - using input_map_t = std:: - unordered_map; + std::function(void)> callback_group_labels; /** - * @brief Output type of the aggregation. + * @brief Callback to access the result from other groupby aggregations. + */ + std::function)> callback_compute_aggregation; + + protected: + /** + * @brief Access the input values column. * - * Currently only a single type is supported as the output of the aggregation, but it will hold - * more type in the future when reduction is supported. + * @return The input values column. 
*/ - using output_t = std::variant>; + [[nodiscard]] column_view get_input_values() const + { + CUDF_EXPECTS(callback_input_values, "Uninitialized callback_input_values."); + return callback_input_values(); + } /** - * @brief Get the output when the input values column is empty. + * @brief Access the input values grouped according to the input keys for which the values + * within each group maintain their original order. * - * This is called in libcudf when the input values column is empty. In such situations libcudf - * tries to generate the output directly without unnecessarily evaluating the intermediate data. + * @return The grouped values column. + */ + [[nodiscard]] column_view get_grouped_values() const + { + CUDF_EXPECTS(callback_grouped_values, "Uninitialized callback_grouped_values."); + return callback_grouped_values(); + } + + /** + * @brief Access the input values grouped according to the input keys and sorted within each + * group. * - * @param output_dtype The expected output data type - * @param stream The CUDA stream to use for any kernel launches - * @param mr Device memory resource to use for any allocations - * @return The output result of the aggregation when input values is empty + * @return The sorted grouped values column. */ - [[nodiscard]] virtual output_t get_empty_output(std::optional output_dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; + [[nodiscard]] column_view get_sorted_grouped_values() const + { + CUDF_EXPECTS(callback_sorted_grouped_values, "Uninitialized callback_sorted_grouped_values."); + return callback_sorted_grouped_values(); + } /** - * @brief Perform the main computation for the host-based UDF. + * @brief Access the number of groups (i.e., number of distinct keys). * - * @param input The input data needed for performing all computation - * @param stream The CUDA stream to use for any kernel launches - * @param mr Device memory resource to use for any allocations - * @return The output result of the aggregation + * @return The number of groups. */ - [[nodiscard]] virtual output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; + [[nodiscard]] size_type get_num_groups() const + { + CUDF_EXPECTS(callback_num_groups, "Uninitialized callback_num_groups."); + return callback_num_groups(); + } /** - * @brief Computes hash value of the class's instance. - * @return The hash value of the instance + * @brief Access the offsets separating groups. + * + * @return The array of group offsets. */ - [[nodiscard]] virtual std::size_t do_hash() const + [[nodiscard]] device_span get_group_offsets() const { - return std::hash{}(static_cast(aggregation::Kind::HOST_UDF)); + CUDF_EXPECTS(callback_group_offsets, "Uninitialized callback_group_offsets."); + return callback_group_offsets(); } /** - * @brief Compares two instances of the derived class for equality. - * @param other The other derived class's instance to compare with - * @return True if the two instances are equal + * @brief Access the group labels (which is also the same as group indices). + * + * @return The array of group labels. */ - [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + [[nodiscard]] device_span get_group_labels() const + { + CUDF_EXPECTS(callback_group_labels, "Uninitialized callback_group_labels."); + return callback_group_labels(); + } /** - * @brief Clones the instance. + * @brief Compute a built-in groupby aggregation and access its result. 
* - * A class derived from `host_udf_base` should not store too much data such that its instances - * remain lightweight for efficient cloning. + * This allows the derived class to call any other built-in groupby aggregations on the same input + * values column and access the output for its operations. * - * @return A new instance cloned from this + * @param other_agg An arbitrary built-in groupby aggregation + * @return A `column_view` object corresponding to the output result of the given aggregation */ - [[nodiscard]] virtual std::unique_ptr clone() const = 0; + [[nodiscard]] column_view compute_aggregation(std::unique_ptr other_agg) const + { + CUDF_EXPECTS(callback_compute_aggregation, "Uninitialized callback for computing aggregation."); + return callback_compute_aggregation(std::move(other_agg)); + } }; /** @} */ // end of group diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index d873e93bd20..5574ed6ea6e 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -967,7 +967,9 @@ class udf_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying host-based UDF aggregation. */ -class host_udf_aggregation final : public groupby_aggregation { +class host_udf_aggregation final : public groupby_aggregation, + public reduce_aggregation, + public segmented_reduce_aggregation { public: std::unique_ptr udf_ptr; diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 4c90cd0eef5..6234148e9fa 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,8 +145,11 @@ struct empty_column_constructor { } if constexpr (k == aggregation::Kind::HOST_UDF) { - auto const& udf_ptr = dynamic_cast(agg).udf_ptr; - return std::get>(udf_ptr->get_empty_output(std::nullopt, stream, mr)); + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation."); + return udf_ptr->get_empty_output(stream, mr); } return make_empty_column(target_type(values.type(), k)); diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 6480070e85a..fb3f7559d64 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -795,58 +795,41 @@ void aggregate_result_functor::operator()(aggregation con { if (cache.has_result(values, agg)) { return; } - auto const& udf_ptr = dynamic_cast(agg).udf_ptr; - auto const data_attrs = [&]() -> host_udf_base::data_attribute_set_t { - if (auto tmp = udf_ptr->get_required_data(); !tmp.empty()) { return tmp; } - // Empty attribute set means everything. 
- return {host_udf_base::groupby_data_attribute::INPUT_VALUES, - host_udf_base::groupby_data_attribute::GROUPED_VALUES, - host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, - host_udf_base::groupby_data_attribute::NUM_GROUPS, - host_udf_base::groupby_data_attribute::GROUP_OFFSETS, - host_udf_base::groupby_data_attribute::GROUP_LABELS}; - }(); + auto const& udf_base_ptr = dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation."); - // Do not cache udf_input, as the actual input data may change from run to run. - host_udf_base::input_map_t udf_input; - for (auto const& attr : data_attrs) { - CUDF_EXPECTS(std::holds_alternative(attr.value) || - std::holds_alternative>(attr.value), - "Invalid input data attribute for HOST_UDF groupby aggregation."); - if (std::holds_alternative(attr.value)) { - switch (std::get(attr.value)) { - case host_udf_base::groupby_data_attribute::INPUT_VALUES: - udf_input.emplace(attr, values); - break; - case host_udf_base::groupby_data_attribute::GROUPED_VALUES: - udf_input.emplace(attr, get_grouped_values()); - break; - case host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES: - udf_input.emplace(attr, get_sorted_values()); - break; - case host_udf_base::groupby_data_attribute::NUM_GROUPS: - udf_input.emplace(attr, helper.num_groups(stream)); - break; - case host_udf_base::groupby_data_attribute::GROUP_OFFSETS: - udf_input.emplace(attr, helper.group_offsets(stream)); - break; - case host_udf_base::groupby_data_attribute::GROUP_LABELS: - udf_input.emplace(attr, helper.group_labels(stream)); - break; - default: CUDF_UNREACHABLE("Invalid input data attribute for HOST_UDF groupby aggregation."); - } - } else { // data is result from another aggregation - auto other_agg = std::get>(attr.value)->clone(); + if (!udf_ptr->callback_input_values) { + udf_ptr->callback_input_values = [&]() -> column_view { return values; }; + } + if (!udf_ptr->callback_grouped_values) { + udf_ptr->callback_grouped_values = [&]() -> column_view { return get_grouped_values(); }; + } + if (!udf_ptr->callback_sorted_grouped_values) { + udf_ptr->callback_sorted_grouped_values = [&]() -> column_view { return get_sorted_values(); }; + } + if (!udf_ptr->callback_num_groups) { + udf_ptr->callback_num_groups = [&]() -> size_type { return helper.num_groups(stream); }; + } + if (!udf_ptr->callback_group_offsets) { + udf_ptr->callback_group_offsets = [&]() -> device_span { + return helper.group_offsets(stream); + }; + } + if (!udf_ptr->callback_group_labels) { + udf_ptr->callback_group_labels = [&]() -> device_span { + return helper.group_labels(stream); + }; + } + if (!udf_ptr->callback_compute_aggregation) { + udf_ptr->callback_compute_aggregation = + [&](std::unique_ptr other_agg) -> column_view { cudf::detail::aggregation_dispatcher(other_agg->kind, *this, *other_agg); - auto result = cache.get_result(values, *other_agg); - udf_input.emplace(std::move(other_agg), std::move(result)); - } + return cache.get_result(values, *other_agg); + }; } - auto output = (*udf_ptr)(udf_input, stream, mr); - CUDF_EXPECTS(std::holds_alternative>(output), - "Invalid output type from HOST_UDF groupby aggregation."); - cache.add_result(values, agg, std::get>(std::move(output))); + cache.add_result(values, agg, (*udf_ptr)(stream, mr)); } } // namespace detail diff --git a/cpp/src/groupby/sort/host_udf_aggregation.cpp b/cpp/src/groupby/sort/host_udf_aggregation.cpp index 
0da47e17f48..6f1fe80c4bd 100644 --- a/cpp/src/groupby/sort/host_udf_aggregation.cpp +++ b/cpp/src/groupby/sort/host_udf_aggregation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,51 +16,9 @@ #include #include -#include namespace cudf { -host_udf_base::data_attribute::data_attribute(data_attribute const& other) - : value{std::visit(cudf::detail::visitor_overload{[](auto const& val) { return value_type{val}; }, - [](std::unique_ptr const& val) { - return value_type{val->clone()}; - }}, - other.value)} -{ -} - -std::size_t host_udf_base::data_attribute::hash::operator()(data_attribute const& attr) const -{ - auto const hash_value = - std::visit(cudf::detail::visitor_overload{ - [](auto const& val) { return std::hash{}(static_cast(val)); }, - [](std::unique_ptr const& val) { return val->do_hash(); }}, - attr.value); - return std::hash{}(attr.value.index()) ^ hash_value; -} - -bool host_udf_base::data_attribute::equal_to::operator()(data_attribute const& lhs, - data_attribute const& rhs) const -{ - auto const& lhs_val = lhs.value; - auto const& rhs_val = rhs.value; - if (lhs_val.index() != rhs_val.index()) { return false; } - return std::visit( - cudf::detail::visitor_overload{ - [](auto const& lhs_val, auto const& rhs_val) { - if constexpr (std::is_same_v) { - return lhs_val == rhs_val; - } else { - return false; - } - }, - [](std::unique_ptr const& lhs_val, std::unique_ptr const& rhs_val) { - return lhs_val->is_equal(*rhs_val); - }}, - lhs_val, - rhs_val); -} - namespace detail { host_udf_aggregation::host_udf_aggregation(std::unique_ptr udf_ptr_) @@ -99,5 +57,9 @@ template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation); template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation(std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); } // namespace cudf diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 75ebc078930..928625a7e8f 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -144,6 +145,13 @@ struct reduce_dispatch_functor { auto td_agg = static_cast(agg); return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr); } + case aggregation::HOST_UDF: { + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for reduction."); + return (*udf_ptr)(col, output_dtype, init, stream, mr); + } // case aggregation::HOST_UDF default: CUDF_FAIL("Unsupported reduction operator"); } } @@ -161,9 +169,11 @@ std::unique_ptr reduce(column_view const& col, cudf::data_type_error); if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || - agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL || + agg.kind == aggregation::HOST_UDF)) { CUDF_FAIL( - "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); + "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF " + "aggregation types"); } // Returns default scalar if input column is empty or all null diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 1c3a2b0c0f3..5835bfcf0a1 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +#include #include #include #include @@ -98,6 +100,13 @@ struct segmented_reduce_dispatch_functor { } case segmented_reduce_aggregation::NUNIQUE: return segmented_nunique(col, offsets, null_handling, stream, mr); + case aggregation::HOST_UDF: { + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for segmented reduction."); + return (*udf_ptr)(col, offsets, output_dtype, null_handling, init, stream, mr); + } // case aggregation::HOST_UDF default: CUDF_FAIL("Unsupported aggregation type."); } } @@ -117,9 +126,11 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values, cudf::data_type_error); if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || - agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL || + agg.kind == aggregation::HOST_UDF)) { CUDF_FAIL( - "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); + "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF " + "aggregation types"); } if (segmented_values.is_empty() && offsets.empty()) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 344979e1288..35877ac34b9 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -220,11 +220,12 @@ ConfigureTest( REDUCTIONS_TEST reductions/collect_ops_tests.cpp reductions/ewm_tests.cpp + reductions/host_udf_example_tests.cu + reductions/list_rank_test.cpp reductions/rank_tests.cpp reductions/reduction_tests.cpp reductions/scan_tests.cpp reductions/segmented_reduction_tests.cpp - reductions/list_rank_test.cpp reductions/tdigest_tests.cu GPUS 1 PERCENT 70 diff --git a/cpp/tests/groupby/host_udf_example_tests.cu b/cpp/tests/groupby/host_udf_example_tests.cu index a454bd692fc..e1ded37d8a7 100644 --- a/cpp/tests/groupby/host_udf_example_tests.cu +++ b/cpp/tests/groupby/host_udf_example_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,9 +21,7 @@ #include #include #include -#include #include -#include #include #include @@ -34,6 +32,9 @@ #include #include +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; + namespace { /** * @brief A host-based UDF implementation for groupby. @@ -41,42 +42,21 @@ namespace { * For each group of values, the aggregation computes * `(group_idx + 1) * group_sum_of_squares - group_max * group_sum`. */ -struct host_udf_groupby_example : cudf::host_udf_base { +struct host_udf_groupby_example : cudf::groupby_host_udf { host_udf_groupby_example() = default; - [[nodiscard]] data_attribute_set_t get_required_data() const override - { - // We need grouped values, group offsets, group labels, and also results from groups' - // MAX and SUM aggregations. 
- return {groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS, - cudf::make_max_aggregation(), - cudf::make_sum_aggregation()}; - } - - [[nodiscard]] output_t get_empty_output( - [[maybe_unused]] std::optional output_dtype, - [[maybe_unused]] rmm::cuda_stream_view stream, - [[maybe_unused]] rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr get_empty_output( + rmm::cuda_stream_view, rmm::device_async_resource_ref) const override { return cudf::make_empty_column( cudf::data_type{cudf::type_to_id()}); } - [[nodiscard]] output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override { - auto const& values = - std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); - return cudf::type_dispatcher(values.type(), groupby_fn{this}, input, stream, mr); - } - - [[nodiscard]] std::size_t do_hash() const override - { - // Just return the same hash for all instances of this class. - return std::size_t{12345}; + auto const values = get_grouped_values(); + return cudf::type_dispatcher(values.type(), groupby_fn{*this}, stream, mr); } [[nodiscard]] bool is_equal(host_udf_base const& other) const override @@ -92,37 +72,33 @@ struct host_udf_groupby_example : cudf::host_udf_base { struct groupby_fn { // Store pointer to the parent class so we can call its functions. - host_udf_groupby_example const* parent; + host_udf_groupby_example const& parent; - // For simplicity, this example only accepts double input and always produces double output. + // For simplicity, this example only accepts a single type input and output. using InputType = double; using OutputType = double; template )> - output_t operator()(Args...) const + std::unique_ptr operator()(Args...) 
const { CUDF_FAIL("Unsupported input type."); } template )> - output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + std::unique_ptr operator()(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - auto const& values = - std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); - if (values.size() == 0) { return parent->get_empty_output(std::nullopt, stream, mr); } + auto const values = parent.get_grouped_values(); + if (values.size() == 0) { return parent.get_empty_output(stream, mr); } - auto const offsets = std::get>( - input.at(groupby_data_attribute::GROUP_OFFSETS)); + auto const offsets = parent.get_group_offsets(); CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets."); auto const num_groups = static_cast(offsets.size()) - 1; - auto const group_indices = std::get>( - input.at(groupby_data_attribute::GROUP_LABELS)); - auto const group_max = std::get( - input.at(cudf::make_max_aggregation())); - auto const group_sum = std::get( - input.at(cudf::make_sum_aggregation())); + auto const group_indices = parent.get_group_labels(); + auto const group_max = + parent.compute_aggregation(cudf::make_max_aggregation()); + auto const group_sum = + parent.compute_aggregation(cudf::make_sum_aggregation()); auto const values_dv_ptr = cudf::column_device_view::create(values, stream); auto const output = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, @@ -191,9 +167,6 @@ struct host_udf_groupby_example : cudf::host_udf_base { } // namespace -using doubles_col = cudf::test::fixed_width_column_wrapper; -using int32s_col = cudf::test::fixed_width_column_wrapper; - struct HostUDFGroupbyExampleTest : cudf::test::BaseFixture {}; TEST_F(HostUDFGroupbyExampleTest, SimpleInput) diff --git a/cpp/tests/groupby/host_udf_tests.cpp b/cpp/tests/groupby/host_udf_tests.cpp index 1a0f68c0c6c..17da28cdefc 100644 --- a/cpp/tests/groupby/host_udf_tests.cpp +++ b/cpp/tests/groupby/host_udf_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,178 +26,121 @@ #include namespace { + /** - * @brief A host-based UDF implementation used for unit tests. + * @brief Generate a random aggregation object from {min, max, sum, product}. */ -struct host_udf_test_base : cudf::host_udf_base { +std::unique_ptr get_random_agg() +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distr(1, 4); + switch (distr(gen)) { + case 1: return cudf::make_min_aggregation(); + case 2: return cudf::make_max_aggregation(); + case 3: return cudf::make_sum_aggregation(); + case 4: return cudf::make_product_aggregation(); + default: CUDF_UNREACHABLE("This should not be reached."); + } + return nullptr; +} + +/** + * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
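+ * It exercises the accessor-based cudf::groupby_host_udf interface (get_grouped_values(), get_group_offsets(), compute_aggregation(), ...) and checks the type returned by each accessor.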
+ */ +struct host_udf_groupby_test : cudf::groupby_host_udf { int test_location_line; // the location where testing is called bool* test_run; // to check if the test is accidentally skipped - data_attribute_set_t input_attrs; + bool test_other_agg; // test calling other aggregation - host_udf_test_base(int test_location_line_, bool* test_run_, data_attribute_set_t input_attrs_) - : test_location_line{test_location_line_}, - test_run{test_run_}, - input_attrs(std::move(input_attrs_)) + host_udf_groupby_test(int test_location_line_, bool* test_run_, bool test_other_agg_) + : test_location_line{test_location_line_}, test_run{test_run_}, test_other_agg{test_other_agg_} { } - [[nodiscard]] data_attribute_set_t get_required_data() const override { return input_attrs; } - - // This is the main testing function, which checks for the correctness of input data. - // The rests are just to satisfy the interface. - [[nodiscard]] output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::size_t do_hash() const override { return 0; } + [[nodiscard]] bool is_equal(host_udf_base const& other) const override { - SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); - - test_data_attributes(input, stream, mr); - - *test_run = true; // test is run successfully - return get_empty_output(std::nullopt, stream, mr); + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(test_location_line, test_run, test_other_agg); } - [[nodiscard]] output_t get_empty_output( - [[maybe_unused]] std::optional output_dtype, + [[nodiscard]] std::unique_ptr get_empty_output( [[maybe_unused]] rmm::cuda_stream_view stream, [[maybe_unused]] rmm::device_async_resource_ref mr) const override { - // Unused function - dummy output. + // Dummy output. return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); } - [[nodiscard]] std::size_t do_hash() const override { return 0; } - [[nodiscard]] bool is_equal(host_udf_base const& other) const override { return true; } + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override + { + SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); - // The main test function, which must be implemented for each kind of aggregations - // (groupby/reduction/segmented_reduction). - virtual void test_data_attributes(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; -}; + // Perform tests on types for the groupby data: we must ensure the data corresponding to each + // `groupby_data` enum having the correct type. -/** - * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
- */ -struct host_udf_groupby_test : host_udf_test_base { - host_udf_groupby_test(int test_location_line_, - bool* test_run_, - data_attribute_set_t input_attrs_ = {}) - : host_udf_test_base(test_location_line_, test_run_, std::move(input_attrs_)) - { - } + { + auto const inp_data = get_input_values(); + EXPECT_TRUE((std::is_same_v>)); + } - [[nodiscard]] std::unique_ptr clone() const override - { - return std::make_unique(test_location_line, test_run, input_attrs); - } + { + auto const inp_data = get_grouped_values(); + EXPECT_TRUE((std::is_same_v>)); + } - void test_data_attributes(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override - { - data_attribute_set_t check_attrs = input_attrs; - if (check_attrs.empty()) { - check_attrs = data_attribute_set_t{groupby_data_attribute::INPUT_VALUES, - groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::SORTED_GROUPED_VALUES, - groupby_data_attribute::NUM_GROUPS, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS}; + { + auto const inp_data = get_sorted_grouped_values(); + EXPECT_TRUE((std::is_same_v>)); } - EXPECT_EQ(input.size(), check_attrs.size()); - for (auto const& attr : check_attrs) { - EXPECT_TRUE(input.count(attr) > 0); - EXPECT_TRUE(std::holds_alternative(attr.value) || - std::holds_alternative>(attr.value)); - if (std::holds_alternative(attr.value)) { - switch (std::get(attr.value)) { - case groupby_data_attribute::INPUT_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::GROUPED_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::SORTED_GROUPED_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::NUM_GROUPS: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::GROUP_OFFSETS: - EXPECT_TRUE( - std::holds_alternative>(input.at(attr))); - break; - case groupby_data_attribute::GROUP_LABELS: - EXPECT_TRUE( - std::holds_alternative>(input.at(attr))); - break; - default:; - } - } else { // std::holds_alternative>(attr.value) - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - } + + { + auto const inp_data = get_num_groups(); + EXPECT_TRUE((std::is_same_v>)); } - } -}; -/** - * @brief Get a random subset of input data attributes. - */ -cudf::host_udf_base::data_attribute_set_t get_subset( - cudf::host_udf_base::data_attribute_set_t const& attrs) -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution size_distr(1, attrs.size() - 1); - auto const subset_size = size_distr(gen); - auto const elements = - std::vector(attrs.begin(), attrs.end()); - std::uniform_int_distribution idx_distr(0, attrs.size() - 1); - cudf::host_udf_base::data_attribute_set_t output; - while (output.size() < subset_size) { - output.insert(elements[idx_distr(gen)]); - } - return output; -} + { + auto const inp_data = get_group_offsets(); + EXPECT_TRUE((std::is_same_v, + std::decay_t>)); + } -/** - * @brief Generate a random aggregation object from {min, max, sum, product}. 
- */ -std::unique_ptr get_random_agg() -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution distr(1, 4); - switch (distr(gen)) { - case 1: return cudf::make_min_aggregation(); - case 2: return cudf::make_max_aggregation(); - case 3: return cudf::make_sum_aggregation(); - case 4: return cudf::make_product_aggregation(); - default: CUDF_UNREACHABLE("This should not be reached."); + { + auto const inp_data = get_group_labels(); + EXPECT_TRUE((std::is_same_v, + std::decay_t>)); + } + + // Perform tests on type of the result from computing other aggregations. + if (test_other_agg) { + auto const inp_data = compute_aggregation(get_random_agg()); + EXPECT_TRUE((std::is_same_v>)); + } + + *test_run = true; // test is run successfully + return get_empty_output(stream, mr); } - return nullptr; -} +}; } // namespace using int32s_col = cudf::test::fixed_width_column_wrapper; -// Number of randomly testing on the input data attributes. -// For each test, a subset of data attributes will be randomly generated from all the possible input -// data attributes. The input data corresponding to that subset passed from libcudf will be tested -// for correctness. -constexpr int NUM_RANDOM_TESTS = 20; - struct HostUDFTest : cudf::test::BaseFixture {}; -TEST_F(HostUDFTest, GroupbyAllInput) +TEST_F(HostUDFTest, GroupbyBuiltinInput) { bool test_run = false; auto const keys = int32s_col{0, 1, 2}; auto const vals = int32s_col{0, 1, 2}; auto agg = cudf::make_host_udf_aggregation( - std::make_unique(__LINE__, &test_run)); + std::make_unique(__LINE__, &test_run, /*test_other_agg*/ false)); std::vector requests; requests.emplace_back(); @@ -205,28 +148,22 @@ TEST_F(HostUDFTest, GroupbyAllInput) requests[0].aggregations.push_back(std::move(agg)); cudf::groupby::groupby gb_obj( cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); - [[maybe_unused]] auto const grp_result = - gb_obj.aggregate(requests, cudf::test::get_default_stream()); + [[maybe_unused]] auto const grp_result = gb_obj.aggregate( + requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); EXPECT_TRUE(test_run); } -TEST_F(HostUDFTest, GroupbySomeInput) +TEST_F(HostUDFTest, GroupbyWithCallingOtherAggregations) { - auto const keys = int32s_col{0, 1, 2}; - auto const vals = int32s_col{0, 1, 2}; - auto const all_attrs = cudf::host_udf_base::data_attribute_set_t{ - cudf::host_udf_base::groupby_data_attribute::INPUT_VALUES, - cudf::host_udf_base::groupby_data_attribute::GROUPED_VALUES, - cudf::host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, - cudf::host_udf_base::groupby_data_attribute::NUM_GROUPS, - cudf::host_udf_base::groupby_data_attribute::GROUP_OFFSETS, - cudf::host_udf_base::groupby_data_attribute::GROUP_LABELS}; + auto const keys = int32s_col{0, 1, 2}; + auto const vals = int32s_col{0, 1, 2}; + + constexpr int NUM_RANDOM_TESTS = 20; + for (int i = 0; i < NUM_RANDOM_TESTS; ++i) { - bool test_run = false; - auto input_attrs = get_subset(all_attrs); - input_attrs.insert(get_random_agg()); - auto agg = cudf::make_host_udf_aggregation( - std::make_unique(__LINE__, &test_run, std::move(input_attrs))); + bool test_run = false; + auto agg = cudf::make_host_udf_aggregation( + std::make_unique(__LINE__, &test_run, /*test_other_agg*/ true)); std::vector requests; requests.emplace_back(); @@ -234,8 +171,8 @@ TEST_F(HostUDFTest, GroupbySomeInput) requests[0].aggregations.push_back(std::move(agg)); cudf::groupby::groupby gb_obj( cudf::table_view({keys}), 
cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); - [[maybe_unused]] auto const grp_result = - gb_obj.aggregate(requests, cudf::test::get_default_stream()); + [[maybe_unused]] auto const grp_result = gb_obj.aggregate( + requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); EXPECT_TRUE(test_run); } } diff --git a/cpp/tests/reductions/host_udf_example_tests.cu b/cpp/tests/reductions/host_udf_example_tests.cu new file mode 100644 index 00000000000..67b88c5306b --- /dev/null +++ b/cpp/tests/reductions/host_udf_example_tests.cu @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; +using int64s_col = cudf::test::fixed_width_column_wrapper; + +namespace { +/** + * @brief A host-based UDF implementation for reduction. + * + * The aggregation computes `sum(value^2, for value in group)` (this is sum of squared). + */ +struct host_udf_reduction_example : cudf::reduce_host_udf { + host_udf_reduction_example() = default; + + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + return cudf::double_type_dispatcher( + input.type(), output_dtype, reduce_fn{}, input, output_dtype, init, stream, mr); + } + + [[nodiscard]] bool is_equal(host_udf_base const& other) const override + { + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(); + } + + struct reduce_fn { + // For simplicity, this example only accepts a single type input and output. + using InputType = double; + using OutputType = int64_t; + + template || !std::is_same_v)> + std::unique_ptr operator()(Args...) 
const + { + CUDF_FAIL("Unsupported input/output type."); + } + + template && std::is_same_v)> + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id()}, + "Invalid output type."); + if (input.size() == 0) { + return cudf::make_default_constructed_scalar(output_dtype, stream, mr); + } + + auto const init_value = [&]() -> InputType { + if (init.has_value() && init.value().get().is_valid(stream)) { + auto const numeric_init_scalar = + dynamic_cast const*>(&init.value().get()); + CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction."); + return numeric_init_scalar->value(stream); + } + return InputType{0}; + }(); + + auto const input_dv_ptr = cudf::column_device_view::create(input, stream); + auto const result = thrust::transform_reduce(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + transform_fn{*input_dv_ptr}, + static_cast(init_value), + thrust::plus<>{}); + + auto output = cudf::make_numeric_scalar(output_dtype, stream, mr); + static_cast*>(output.get())->set_value(result, stream); + return output; + } + + struct transform_fn { + cudf::column_device_view values; + OutputType __device__ operator()(cudf::size_type idx) const + { + if (values.is_null(idx)) { return OutputType{0}; } + auto const val = static_cast(values.element(idx)); + return val * val; + } + }; + }; +}; + +} // namespace + +struct HostUDFReductionExampleTest : cudf::test::BaseFixture {}; + +TEST_F(HostUDFReductionExampleTest, SimpleInput) +{ + auto const vals = doubles_col{0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const reduced = cudf::reduce(vals, + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + EXPECT_TRUE(reduced->is_valid()); + EXPECT_EQ(cudf::type_id::INT64, reduced->type().id()); + auto const result = + static_cast*>(reduced.get())->value(cudf::get_default_stream()); + auto constexpr expected = 55; // 0^2 + 1^2 + 2^2 + 3^2 + 4^2 + 5^2 = 55 + EXPECT_EQ(expected, result); +} + +TEST_F(HostUDFReductionExampleTest, EmptyInput) +{ + auto const vals = doubles_col{}; + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const reduced = cudf::reduce(vals, + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + EXPECT_FALSE(reduced->is_valid()); + EXPECT_EQ(cudf::type_id::INT64, reduced->type().id()); +} + +namespace { + +/** + * @brief A host-based UDF implementation for segmented reduction. + * + * The aggregation computes `sum(value^2, for value in group)` (this is sum of squared). 
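+ * Each segment's sum is additionally multiplied by the segment's row count, as reflected in the expected values of the tests below.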
+ */ +struct host_udf_segmented_reduction_example : cudf::segmented_reduce_host_udf { + host_udf_segmented_reduction_example() = default; + + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::device_span offsets, + cudf::data_type output_dtype, + cudf::null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + return cudf::double_type_dispatcher(input.type(), + output_dtype, + segmented_reduce_fn{}, + input, + offsets, + output_dtype, + null_handling, + init, + stream, + mr); + } + + [[nodiscard]] bool is_equal(host_udf_base const& other) const override + { + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(); + } + + struct segmented_reduce_fn { + // For simplicity, this example only accepts a single type input and output. + using InputType = double; + using OutputType = int64_t; + + template || !std::is_same_v)> + std::unique_ptr operator()(Args...) const + { + CUDF_FAIL("Unsupported input/output type."); + } + + template && std::is_same_v)> + std::unique_ptr operator()( + cudf::column_view const& input, + cudf::device_span offsets, + cudf::data_type output_dtype, + cudf::null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id()}, + "Invalid output type."); + CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets."); + auto const num_segments = static_cast(offsets.size()) - 1; + + if (input.size() == 0) { + if (num_segments <= 0) { return cudf::make_empty_column(output_dtype); } + return cudf::make_numeric_column( + output_dtype, num_segments, cudf::mask_state::ALL_NULL, stream, mr); + } + + auto const init_value = [&]() -> InputType { + if (init.has_value() && init.value().get().is_valid(stream)) { + auto const numeric_init_scalar = + dynamic_cast const*>(&init.value().get()); + CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction."); + return numeric_init_scalar->value(stream); + } + return InputType{0}; + }(); + + auto const input_dv_ptr = cudf::column_device_view::create(input, stream); + auto output = cudf::make_numeric_column( + output_dtype, num_segments, cudf::mask_state::UNALLOCATED, stream); + + // Store row index if it is valid, otherwise store a negative value denoting a null row. 
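+ // A negative index is out of range for the cudf::gather with NULLIFY below, which turns that output row into a null.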
+ rmm::device_uvector valid_idx(num_segments, stream); + + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_segments), + thrust::make_zip_iterator(output->mutable_view().begin(), valid_idx.begin()), + transform_fn{*input_dv_ptr, offsets, static_cast(init_value), null_handling}); + + auto const valid_idx_cv = cudf::column_view{ + cudf::data_type{cudf::type_id::INT32}, num_segments, valid_idx.begin(), nullptr, 0}; + return std::move(cudf::gather(cudf::table_view{{output->view()}}, + valid_idx_cv, + cudf::out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release() + .front()); + } + + struct transform_fn { + cudf::column_device_view values; + cudf::device_span offsets; + OutputType init_value; + cudf::null_policy null_handling; + + thrust::tuple __device__ operator()(cudf::size_type idx) const + { + auto const start = offsets[idx]; + auto const end = offsets[idx + 1]; + + auto constexpr invalid_idx = cuda::std::numeric_limits::lowest(); + if (start == end) { return {OutputType{0}, invalid_idx}; } + + auto sum = init_value; + for (auto i = start; i < end; ++i) { + if (values.is_null(i)) { + if (null_handling == cudf::null_policy::INCLUDE) { sum += init_value * init_value; } + continue; + } + auto const val = static_cast(values.element(i)); + sum += val * val; + } + auto const segment_size = end - start; + return {static_cast(segment_size) * sum, idx}; + } + }; + }; +}; + +} // namespace + +struct HostUDFSegmentedReductionExampleTest : cudf::test::BaseFixture {}; + +TEST_F(HostUDFSegmentedReductionExampleTest, SimpleInput) +{ + double constexpr null = 0.0; + auto const vals = doubles_col{{0.0, null, 2.0, 3.0, null, 5.0, null, null, 8.0, 9.0}, + {true, false, true, true, false, true, false, false, true, true}}; + auto const offsets = int32s_col{0, 3, 5, 10}.release(); + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + + // Test without init value. + { + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + std::nullopt, // init value + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + // When null_policy is set to `INCLUDE`, the null values are replaced with the init value. + // Since init value is not given, it is set to 0. + // [ 3 * (0^2 + init^2 + 2^2), 2 * (3^2 + init^2), 5 * (5^2 + init^2 + init^2 + 8^2 + 9^2) ] + auto const expected = int64s_col{{12, 18, 850}, {true, true, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with init value, and include nulls. + { + auto const init_scalar = cudf::make_fixed_width_scalar(3.0); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + *init_scalar, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + // When null_policy is set to `INCLUDE`, the null values are replaced with the init value. + // [ 3 * (3 + 0^2 + 3^2 + 2^2), 2 * (3 + 3^2 + 3^2), 5 * (3 + 5^2 + 3^2 + 3^2 + 8^2 + 9^2) ] + auto const expected = int64s_col{{48, 42, 955}, {true, true, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with init value, and exclude nulls. 
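+ // With EXCLUDE, null rows add nothing to the sum, but the segment-size multiplier still counts them.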
+ { + auto const init_scalar = cudf::make_fixed_width_scalar(3.0); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::EXCLUDE, + *init_scalar, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + // [ 3 * (3 + 0^2 + 2^2), 2 * (3 + 3^2), 5 * (3 + 5^2 + 8^2 + 9^2) ] + auto const expected = int64s_col{{21, 24, 865}, {true, true, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +TEST_F(HostUDFSegmentedReductionExampleTest, EmptySegments) +{ + auto const vals = doubles_col{}; + auto const offsets = int32s_col{0, 0, 0, 0}.release(); + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + std::nullopt, // init value + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + auto const expected = int64s_col{{0, 0, 0}, {false, false, false}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(HostUDFSegmentedReductionExampleTest, EmptyInput) +{ + auto const vals = doubles_col{}; + auto const offsets = int32s_col{}.release(); + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + std::nullopt, // init value + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + auto const expected = int64s_col{}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index 2276b223740..c07a58ed8a5 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java index 27966ddfdd4..234a9ec1ced 100644 --- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java index ba8ae379bae..4f047a68f06 100644 --- a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -161,14 +161,14 @@ public static ReductionAggregation median() { /** * Aggregate to compute the specified quantiles. Uses linear interpolation by default. */ - public static ReductionAggregation quantile(double ... quantiles) { + public static ReductionAggregation quantile(double... quantiles) { return new ReductionAggregation(Aggregation.quantile(quantiles)); } /** * Aggregate to compute various quantiles. */ - public static ReductionAggregation quantile(QuantileMethod method, double ... quantiles) { + public static ReductionAggregation quantile(QuantileMethod method, double... quantiles) { return new ReductionAggregation(Aggregation.quantile(method, quantiles)); } @@ -256,7 +256,7 @@ public static ReductionAggregation collectSet() { * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. */ public static ReductionAggregation collectSet(NullPolicy nullPolicy, - NullEquality nullEquality, NaNEquality nanEquality) { + NullEquality nullEquality, NaNEquality nanEquality) { return new ReductionAggregation(Aggregation.collectSet(nullPolicy, nullEquality, nanEquality)); } @@ -286,6 +286,15 @@ public static ReductionAggregation mergeSets(NullEquality nullEquality, NaNEqual return new ReductionAggregation(Aggregation.mergeSets(nullEquality, nanEquality)); } + /** + * Execute a reduction using a host-side user-defined function (UDF). + * @param wrapper The wrapper for the native host UDF instance. + * @return A new ReductionAggregation instance + */ + public static ReductionAggregation hostUDF(HostUDFWrapper wrapper) { + return new ReductionAggregation(Aggregation.hostUDF(wrapper)); + } + /** * Create HistogramAggregation, computing the frequencies for each unique row. * diff --git a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java index 7ed150a2fec..18e7d874886 100644 --- a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -101,4 +101,13 @@ public static SegmentedReductionAggregation any() { public static SegmentedReductionAggregation all() { return new SegmentedReductionAggregation(Aggregation.all()); } + + /** + * Execute a reduction using a host-side user-defined function (UDF). + * @param wrapper The wrapper for the native host UDF instance. + * @return A new SegmentedReductionAggregation instance + */ + public static SegmentedReductionAggregation hostUDF(HostUDFWrapper wrapper) { + return new SegmentedReductionAggregation(Aggregation.hostUDF(wrapper)); + } } From bbf4f7824c23c0c482f52bafdf1ece1213da2f65 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 13 Jan 2025 11:44:54 -0800 Subject: [PATCH 08/12] Host compression (#17656) Add compression APIs to make the nvCOMP use transparent. Remove direct dependency on nvCOMP in the ORC and Parquet writers. Add multi-threaded host-side compression; currently off by default, can only be enabled via `LIBCUDF_USE_HOST_COMPRESSION` environment variable. Currently the host compression adds D2H + H2D transfers. Avoiding the H2D transfer requires large changes to the writers. 
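As an illustration only (not part of this patch): a test or benchmark could opt in as sketched below before invoking a writer. The environment variable names come from the new io/comp/comp.cpp code; the thread count shown is arbitrary.

#include <cstdlib>  // setenv (POSIX)

int main()
{
  // Use host compression when both a host and a device implementation are available.
  setenv("LIBCUDF_USE_HOST_COMPRESSION", "1", 1);
  // Size of the host-side compression thread pool; defaults to std::thread::hardware_concurrency().
  setenv("LIBCUDF_HOST_COMPRESSION_NUM_THREADS", "8", 1);
  // ... run the ORC/Parquet write path here ...
  return 0;
}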
Also moved handling of the AUTO compression type to the options classes, which should own such defaults (translate AUTO to SNAPPY in this case). Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17656 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/io/orc.hpp | 22 ++- cpp/src/io/comp/comp.cpp | 163 ++++++++++++++++++++- cpp/src/io/comp/{statistics.cu => comp.cu} | 4 +- cpp/src/io/comp/comp.hpp | 54 ++++++- cpp/src/io/comp/gpuinflate.hpp | 15 +- cpp/src/io/functions.cpp | 3 +- cpp/src/io/orc/orc_gpu.hpp | 4 +- cpp/src/io/orc/stripe_enc.cu | 49 +------ cpp/src/io/orc/writer_impl.cu | 144 ++++++------------ cpp/src/io/orc/writer_impl.hpp | 4 +- cpp/src/io/parquet/writer_impl.cu | 99 +++++-------- cpp/src/io/parquet/writer_impl.hpp | 4 +- cpp/src/io/parquet/writer_impl_helpers.cpp | 46 +----- cpp/src/io/parquet/writer_impl_helpers.hpp | 38 +---- cpp/tests/io/orc_test.cpp | 3 +- cpp/tests/io/parquet_misc_test.cpp | 3 +- 17 files changed, 338 insertions(+), 319 deletions(-) rename cpp/src/io/comp/{statistics.cu => comp.cu} (96%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9dabe4e8800..252cc7897d8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -477,13 +477,13 @@ add_library( src/io/avro/reader_impl.cu src/io/comp/brotli_dict.cpp src/io/comp/comp.cpp + src/io/comp/comp.cu src/io/comp/cpu_unbz2.cpp src/io/comp/debrotli.cu src/io/comp/gpuinflate.cu src/io/comp/nvcomp_adapter.cpp src/io/comp/nvcomp_adapter.cu src/io/comp/snap.cu - src/io/comp/statistics.cu src/io/comp/uncomp.cpp src/io/comp/unsnap.cu src/io/csv/csv_gpu.cu diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 163fa20806d..82f7761da2e 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -578,7 +578,7 @@ class orc_writer_options { // Specify the sink to use for writer output sink_info _sink; // Specify the compression format to use - compression_type _compression = compression_type::AUTO; + compression_type _compression = compression_type::SNAPPY; // Specify frequency of statistics collection statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) @@ -733,7 +733,11 @@ class orc_writer_options { * * @param comp Compression type */ - void set_compression(compression_type comp) { _compression = comp; } + void set_compression(compression_type comp) + { + _compression = comp; + if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; } + } /** * @brief Choose granularity of statistics collection. 
@@ -865,7 +869,7 @@ class orc_writer_options_builder { */ orc_writer_options_builder& compression(compression_type comp) { - options._compression = comp; + options.set_compression(comp); return *this; } @@ -1026,7 +1030,7 @@ class chunked_orc_writer_options { // Specify the sink to use for writer output sink_info _sink; // Specify the compression format to use - compression_type _compression = compression_type::AUTO; + compression_type _compression = compression_type::SNAPPY; // Specify granularity of statistics collection statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) @@ -1157,7 +1161,11 @@ class chunked_orc_writer_options { * * @param comp The compression type to use */ - void set_compression(compression_type comp) { _compression = comp; } + void set_compression(compression_type comp) + { + _compression = comp; + if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; } + } /** * @brief Choose granularity of statistics collection @@ -1279,7 +1287,7 @@ class chunked_orc_writer_options_builder { */ chunked_orc_writer_options_builder& compression(compression_type comp) { - options._compression = comp; + options.set_compression(comp); return *this; } diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp index 26535bed43b..3800835eaf1 100644 --- a/cpp/src/io/comp/comp.cpp +++ b/cpp/src/io/comp/comp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,22 +16,45 @@ #include "comp.hpp" +#include "gpuinflate.hpp" +#include "io/utilities/getenv_or.hpp" #include "io/utilities/hostdevice_vector.hpp" #include "nvcomp_adapter.hpp" #include #include +#include #include #include #include #include +#include #include // GZIP compression namespace cudf::io::detail { namespace { +auto& h_comp_pool() +{ + static std::size_t pool_size = + getenv_or("LIBCUDF_HOST_COMPRESSION_NUM_THREADS", std::thread::hardware_concurrency()); + static BS::thread_pool pool(pool_size); + return pool; +} + +std::optional to_nvcomp_compression(compression_type compression) +{ + switch (compression) { + case compression_type::SNAPPY: return nvcomp::compression_type::SNAPPY; + case compression_type::ZSTD: return nvcomp::compression_type::ZSTD; + case compression_type::LZ4: return nvcomp::compression_type::LZ4; + case compression_type::ZLIB: return nvcomp::compression_type::DEFLATE; + default: return std::nullopt; + } +} + /** * @brief GZIP host compressor (includes header) */ @@ -98,8 +121,132 @@ std::vector compress_snappy(host_span src, return cudf::detail::make_std_vector_sync(d_dst, stream); } +void device_compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream) +{ + if (compression == compression_type::NONE) { return; } + + auto const nvcomp_type = to_nvcomp_compression(compression); + auto nvcomp_disabled = nvcomp_type.has_value() ? 
nvcomp::is_compression_disabled(*nvcomp_type) + : "invalid compression type"; + if (not nvcomp_disabled) { + return nvcomp::batched_compress(*nvcomp_type, inputs, outputs, results, stream); + } + + switch (compression) { + case compression_type::SNAPPY: return gpu_snap(inputs, outputs, results, stream); + default: CUDF_FAIL("Compression error: " + nvcomp_disabled.value()); + } +} + +void host_compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream) +{ + if (compression == compression_type::NONE) { return; } + + auto const num_chunks = inputs.size(); + auto h_results = cudf::detail::make_host_vector(num_chunks, stream); + auto const h_inputs = cudf::detail::make_host_vector_async(inputs, stream); + auto const h_outputs = cudf::detail::make_host_vector_async(outputs, stream); + stream.synchronize(); + + std::vector> tasks; + auto const num_streams = + std::min({num_chunks, + cudf::detail::global_cuda_stream_pool().get_stream_pool_size(), + h_comp_pool().get_thread_count()}); + auto const streams = cudf::detail::fork_streams(stream, num_streams); + for (size_t i = 0; i < num_chunks; ++i) { + auto const cur_stream = streams[i % streams.size()]; + auto task = [d_in = h_inputs[i], d_out = h_outputs[i], cur_stream, compression]() -> size_t { + auto const h_in = cudf::detail::make_host_vector_sync(d_in, cur_stream); + auto const h_out = compress(compression, h_in, cur_stream); + cudf::detail::cuda_memcpy(d_out.subspan(0, h_out.size()), h_out, cur_stream); + return h_out.size(); + }; + tasks.emplace_back(h_comp_pool().submit_task(std::move(task))); + } + + for (auto i = 0ul; i < num_chunks; ++i) { + h_results[i] = {tasks[i].get(), compression_status::SUCCESS}; + } + cudf::detail::cuda_memcpy_async(results, h_results, stream); +} + +[[nodiscard]] bool host_compression_supported(compression_type compression) +{ + switch (compression) { + case compression_type::GZIP: + case compression_type::NONE: return true; + default: return false; + } +} + +[[nodiscard]] bool device_compression_supported(compression_type compression) +{ + auto const nvcomp_type = to_nvcomp_compression(compression); + switch (compression) { + case compression_type::LZ4: + case compression_type::ZLIB: + case compression_type::ZSTD: return not nvcomp::is_compression_disabled(nvcomp_type.value()); + case compression_type::SNAPPY: + case compression_type::NONE: return true; + default: return false; + } +} + +[[nodiscard]] bool use_host_compression( + compression_type compression, + [[maybe_unused]] device_span const> inputs, + [[maybe_unused]] device_span const> outputs) +{ + CUDF_EXPECTS( + not host_compression_supported(compression) or device_compression_supported(compression), + "Unsupported compression type"); + if (not host_compression_supported(compression)) { return false; } + if (not device_compression_supported(compression)) { return true; } + // If both host and device compression are supported, use the host if the env var is set + return getenv_or("LIBCUDF_USE_HOST_COMPRESSION", 0); +} + } // namespace +std::optional compress_max_allowed_chunk_size(compression_type compression) +{ + if (auto nvcomp_type = to_nvcomp_compression(compression); + nvcomp_type.has_value() and not nvcomp::is_compression_disabled(*nvcomp_type)) { + return nvcomp::compress_max_allowed_chunk_size(*nvcomp_type); + } + return std::nullopt; +} + +[[nodiscard]] size_t compress_required_chunk_alignment(compression_type compression) +{ + auto nvcomp_type = 
to_nvcomp_compression(compression); + if (compression == compression_type::NONE or not nvcomp_type.has_value() or + nvcomp::is_compression_disabled(*nvcomp_type)) { + return 1ul; + } + + return nvcomp::required_alignment(*nvcomp_type); +} + +[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size) +{ + if (compression == compression_type::NONE) { return uncompressed_size; } + + if (auto nvcomp_type = to_nvcomp_compression(compression); nvcomp_type.has_value()) { + return nvcomp::compress_max_output_chunk_size(*nvcomp_type, uncompressed_size); + } + CUDF_FAIL("Unsupported compression type"); +} + std::vector compress(compression_type compression, host_span src, rmm::cuda_stream_view stream) @@ -112,4 +259,18 @@ std::vector compress(compression_type compression, } } +void compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + if (use_host_compression(compression, inputs, outputs)) { + return host_compress(compression, inputs, outputs, results, stream); + } else { + return device_compress(compression, inputs, outputs, results, stream); + } +} + } // namespace cudf::io::detail diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/comp.cu similarity index 96% rename from cpp/src/io/comp/statistics.cu rename to cpp/src/io/comp/comp.cu index caee9145d2c..af0f73869a2 100644 --- a/cpp/src/io/comp/statistics.cu +++ b/cpp/src/io/comp/comp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "gpuinflate.hpp" +#include "comp.hpp" #include diff --git a/cpp/src/io/comp/comp.hpp b/cpp/src/io/comp/comp.hpp index e16f26e1f06..90932a11499 100644 --- a/cpp/src/io/comp/comp.hpp +++ b/cpp/src/io/comp/comp.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,5 +57,57 @@ std::vector compress(compression_type compression, host_span src, rmm::cuda_stream_view stream); +/** + * @brief Maximum size of uncompressed chunks that can be compressed. + * + * @param compression Compression type + * @returns maximum chunk size + */ +[[nodiscard]] std::optional compress_max_allowed_chunk_size(compression_type compression); + +/** + * @brief Gets input and output alignment requirements for the given compression type. + * + * @param compression Compression type + * @returns required alignment + */ +[[nodiscard]] size_t compress_required_chunk_alignment(compression_type compression); + +/** + * @brief Gets the maximum size any chunk could compress to in the batch. + * + * @param compression Compression type + * @param uncompressed_size Size of the largest uncompressed chunk in the batch + */ +[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size); + +/** + * @brief Compresses device memory buffers. 
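+ * Depending on the compression type, the work runs either on the device (kernels / nvCOMP) or on a host-side thread pool; see use_host_compression() and the LIBCUDF_USE_HOST_COMPRESSION environment variable.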
+ * + * @param compression Type of compression of the input data + * @param inputs Device memory buffers to compress + * @param outputs Device memory buffers to store the compressed output + * @param results Compression results + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream); + +/** + * @brief Aggregate results of compression into a single statistics object. + * + * @param inputs List of uncompressed input buffers + * @param results List of compression results + * @param stream CUDA stream to use + * @return writer_compression_statistics + */ +[[nodiscard]] writer_compression_statistics collect_compression_statistics( + device_span const> inputs, + device_span results, + rmm::cuda_stream_view stream); + } // namespace io::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp index 4b09bd5a84c..0a35b230242 100644 --- a/cpp/src/io/comp/gpuinflate.hpp +++ b/cpp/src/io/comp/gpuinflate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -124,17 +124,4 @@ void gpu_snap(device_span const> inputs, device_span results, rmm::cuda_stream_view stream); -/** - * @brief Aggregate results of compression into a single statistics object. - * - * @param inputs List of uncompressed input buffers - * @param results List of compression results - * @param stream CUDA stream to use - * @return writer_compression_statistics - */ -[[nodiscard]] writer_compression_statistics collect_compression_statistics( - device_span const> inputs, - device_span results, - rmm::cuda_stream_view stream); - } // namespace cudf::io::detail diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 88423122e16..d63fa9f5c35 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -766,6 +766,7 @@ void parquet_writer_options_base::set_stats_level(statistics_freq sf) { _stats_l void parquet_writer_options_base::set_compression(compression_type compression) { _compression = compression; + if (compression == compression_type::AUTO) { _compression = compression_type::SNAPPY; } } void parquet_writer_options_base::enable_int96_timestamps(bool req) diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index f4e75f78dec..8b30cee6681 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -407,7 +407,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, std::optional CompressOrcDataStreams( device_span compressed_data, uint32_t num_compressed_blocks, - CompressionKind compression, + compression_type compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, uint32_t comp_block_align, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 79ecca0ca99..4f296bb5bfc 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include "io/comp/gpuinflate.hpp" -#include "io/comp/nvcomp_adapter.hpp" #include "io/utilities/block_utils.cuh" #include "io/utilities/time_utils.cuh" #include "orc_gpu.hpp" @@ -45,8 +44,6 @@ namespace io { namespace orc { namespace gpu { -namespace nvcomp = cudf::io::detail::nvcomp; - using cudf::detail::device_2dspan; using cudf::io::detail::compression_result; using cudf::io::detail::compression_status; @@ -1362,7 +1359,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, std::optional CompressOrcDataStreams( device_span compressed_data, uint32_t num_compressed_blocks, - CompressionKind compression, + compression_type compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, uint32_t comp_block_align, @@ -1387,47 +1384,7 @@ std::optional CompressOrcDataStreams( max_comp_blk_size, comp_block_align); - if (compression == SNAPPY) { - try { - if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { - cudf::io::detail::gpu_snap(comp_in, comp_out, comp_res, stream); - } else { - nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } - } catch (...) 
{ - // There was an error in compressing so set an error status for each block - thrust::for_each( - rmm::exec_policy(stream), - comp_res.begin(), - comp_res.end(), - [] __device__(compression_result & stat) { stat.status = compression_status::FAILURE; }); - // Since SNAPPY is the default compression (may not be explicitly requested), fall back to - // writing without compression - CUDF_LOG_WARN("ORC writer: compression failed, writing uncompressed data"); - } - } else if (compression == ZLIB) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress( - nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_res, stream); - } else if (compression == ZSTD) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); - } else if (compression == LZ4) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::LZ4); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::LZ4, comp_in, comp_out, comp_res, stream); - } else if (compression != NONE) { - CUDF_FAIL("Unsupported compression type"); - } + cudf::io::detail::compress(compression, comp_in, comp_out, comp_res, stream); dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ce868b83c04..aa0b509981a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ * @brief cuDF-IO ORC writer class implementation */ -#include "io/comp/nvcomp_adapter.hpp" #include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" @@ -71,8 +70,6 @@ namespace cudf::io::orc::detail { -namespace nvcomp = cudf::io::detail::nvcomp; - template [[nodiscard]] constexpr int varint_size(T val) { @@ -92,21 +89,8 @@ struct row_group_index_info { }; namespace { - /** - * @brief Translates ORC compression to nvCOMP compression - */ -auto to_nvcomp_compression_type(CompressionKind compression_kind) -{ - if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY; - if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE; - if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD; - if (compression_kind == LZ4) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); -} - -/** - * @brief Translates cuDF compression to ORC compression + * @brief Translates cuDF compression to ORC compression. */ orc::CompressionKind to_orc_compression(compression_type compression) { @@ -122,19 +106,14 @@ orc::CompressionKind to_orc_compression(compression_type compression) } /** - * @brief Returns the block size for a given compression kind. + * @brief Returns the block size for a given compression format. 
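+ * The result is the smaller of 256 KiB and the compressor's maximum allowed chunk size.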
*/ -constexpr size_t compression_block_size(orc::CompressionKind compression) +size_t compression_block_size(compression_type compression) { - if (compression == orc::CompressionKind::NONE) { return 0; } - - auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) - ? std::nullopt - : nvcomp::compress_max_allowed_chunk_size(ncomp_type); + auto const comp_limit = compress_max_allowed_chunk_size(compression); constexpr size_t max_block_size = 256 * 1024; - return std::min(nvcomp_limit.value_or(max_block_size), max_block_size); + return std::min(comp_limit.value_or(max_block_size), max_block_size); } /** @@ -534,26 +513,6 @@ size_t RLE_stream_size(TypeKind kind, size_t count) } } -auto uncomp_block_alignment(CompressionKind compression_kind) -{ - if (compression_kind == NONE or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { - return 1ul; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind)); -} - -auto comp_block_alignment(CompressionKind compression_kind) -{ - if (compression_kind == NONE or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { - return 1ul; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind)); -} - /** * @brief Builds up per-column streams. * @@ -566,7 +525,7 @@ orc_streams create_streams(host_span columns, file_segmentation const& segmentation, std::map const& decimal_column_sizes, bool enable_dictionary, - CompressionKind compression_kind, + compression_type compression, single_write_mode write_mode) { // 'column 0' row index stream @@ -610,7 +569,7 @@ orc_streams create_streams(host_span columns, auto add_stream = [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { - auto const max_alignment_padding = uncomp_block_alignment(compression_kind) - 1; + auto const max_alignment_padding = compress_required_chunk_alignment(compression) - 1; const auto base = column.index() * gpu::CI_NUM_STREAMS; ids[base + index_type] = streams.size(); streams.push_back(orc::Stream{ @@ -1473,7 +1432,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, * @param[in] rg_stats row group level statistics * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams - * @param[in] compression_kind The compression kind + * @param[in] compression The compression format * @param[in] compression_blocksize The block size used for compression * @param[in] out_sink Sink for writing data */ @@ -1487,7 +1446,7 @@ void write_index_stream(int32_t stripe_id, host_span rg_stats, StripeInformation* stripe, orc_streams* streams, - CompressionKind compression_kind, + compression_type compression, size_t compression_blocksize, std::unique_ptr const& out_sink) { @@ -1501,7 +1460,7 @@ void write_index_stream(int32_t stripe_id, row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { auto const& ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; record.blk_pos = ss.first_block; record.comp_pos = 0; @@ -1541,7 +1500,7 @@ void write_index_stream(int32_t stripe_id, } } - ProtobufWriter pbw((compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((compression != compression_type::NONE) ? 
3 : 0); // Add row index entries auto const& rowgroups_range = segmentation.stripes[stripe_id]; @@ -1566,7 +1525,7 @@ void write_index_stream(int32_t stripe_id, }); (*streams)[stream_id].length = pbw.size(); - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { uint32_t uncomp_ix_len = (uint32_t)((*streams)[stream_id].length - 3) * 2 + 1; pbw.buffer()[0] = static_cast(uncomp_ix_len >> 0); pbw.buffer()[1] = static_cast(uncomp_ix_len >> 8); @@ -1585,7 +1544,7 @@ void write_index_stream(int32_t stripe_id, * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams - * @param[in] compression_kind The compression kind + * @param[in] compression The compression format * @param[in] out_sink Sink for writing data * @param[in] stream CUDA stream used for device memory operations and kernel launches * @return An std::future that should be synchronized to ensure the writing is complete @@ -1596,7 +1555,7 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, host_span bounce_buffer, StripeInformation* stripe, orc_streams* streams, - CompressionKind compression_kind, + compression_type compression, std::unique_ptr const& out_sink, rmm::cuda_stream_view stream) { @@ -1606,8 +1565,9 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, return std::async(std::launch::deferred, [] {}); } - auto const* stream_in = (compression_kind == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] - : (compressed_data + strm_desc.bfr_offset); + auto const* stream_in = (compression == compression_type::NONE) + ? enc_stream.data_ptrs[strm_desc.stream_type] + : (compressed_data + strm_desc.bfr_offset); auto write_task = [&]() { if (out_sink->is_device_write_preferred(length)) { @@ -1627,15 +1587,15 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, /** * @brief Insert 3-byte uncompressed block headers in a byte vector * - * @param compression_kind The compression kind + * @param compression The compression kind * @param compression_blocksize The block size used for compression * @param v The destitation byte vector to write, which must include initial 3-byte header */ -void add_uncompressed_block_headers(CompressionKind compression_kind, +void add_uncompressed_block_headers(compression_type compression, size_t compression_blocksize, std::vector& v) { - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { size_t uncomp_len = v.size() - 3, pos = 0, block_len; while (uncomp_len > compression_blocksize) { block_len = compression_blocksize * 2 + 1; @@ -2021,14 +1981,6 @@ std::map decimal_column_sizes( return column_sizes; } -size_t max_compression_output_size(CompressionKind compression_kind, uint32_t compression_blocksize) -{ - if (compression_kind == NONE) return 0; - - return nvcomp::compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), - compression_blocksize); -} - std::unique_ptr make_table_meta(table_view const& input) { auto table_meta = std::make_unique(input); @@ -2287,7 +2239,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, * @param row_index_stride The row index stride * @param enable_dictionary Whether dictionary is enabled * @param sort_dictionaries Whether to sort the dictionaries - * @param compression_kind The compression kind + * @param compression The compression format * @param compression_blocksize The block size used for compression * @param 
stats_freq Column statistics granularity type for parquet/orc writers * @param collect_compression_stats Flag to indicate if compression statistics should be collected @@ -2302,7 +2254,7 @@ auto convert_table_to_orc_data(table_view const& input, size_type row_index_stride, bool enable_dictionary, bool sort_dictionaries, - CompressionKind compression_kind, + compression_type compression, size_t compression_blocksize, statistics_freq stats_freq, bool collect_compression_stats, @@ -2329,17 +2281,16 @@ auto convert_table_to_orc_data(table_view const& input, auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); - auto const uncompressed_block_align = uncomp_block_alignment(compression_kind); - auto const compressed_block_align = comp_block_alignment(compression_kind); + auto const block_align = compress_required_chunk_alignment(compression); auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes), enable_dictionary, - compression_kind, + compression, write_mode); auto enc_data = encode_columns( - orc_table, std::move(dec_chunk_sizes), segmentation, streams, uncompressed_block_align, stream); + orc_table, std::move(dec_chunk_sizes), segmentation, streams, block_align, stream); stripe_dicts.on_encode_complete(stream); @@ -2371,16 +2322,15 @@ auto convert_table_to_orc_data(table_view const& input, size_t compressed_bfr_size = 0; size_t num_compressed_blocks = 0; - auto const max_compressed_block_size = - max_compression_output_size(compression_kind, compression_blocksize); + auto const max_compressed_block_size = max_compressed_size(compression, compression_blocksize); auto const padded_max_compressed_block_size = - util::round_up_unsafe(max_compressed_block_size, compressed_block_align); + util::round_up_unsafe(max_compressed_block_size, block_align); auto const padded_block_header_size = - util::round_up_unsafe(block_header_size, compressed_block_align); + util::round_up_unsafe(block_header_size, block_align); for (auto& ss : strm_descs.host_view().flat_view()) { size_t stream_size = ss.stream_size; - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { ss.first_block = num_compressed_blocks; ss.bfr_offset = compressed_bfr_size; @@ -2401,14 +2351,14 @@ auto convert_table_to_orc_data(table_view const& input, comp_results.d_begin(), comp_results.d_end(), compression_result{0, compression_status::FAILURE}); - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { strm_descs.host_to_device_async(stream); compression_stats = gpu::CompressOrcDataStreams(compressed_data, num_compressed_blocks, - compression_kind, + compression, compression_blocksize, max_compressed_block_size, - compressed_block_align, + block_align, collect_compression_stats, strm_descs, enc_data.streams, @@ -2459,8 +2409,8 @@ writer::impl::impl(std::unique_ptr sink, : _stream(stream), _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, _row_index_stride{options.get_row_index_stride()}, - _compression_kind(to_orc_compression(options.get_compression())), - _compression_blocksize(compression_block_size(_compression_kind)), + _compression{options.get_compression()}, + _compression_blocksize(compression_block_size(_compression)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), _sort_dictionaries{options.get_enable_dictionary_sort()}, 
@@ -2480,8 +2430,8 @@ writer::impl::impl(std::unique_ptr sink, : _stream(stream), _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, _row_index_stride{options.get_row_index_stride()}, - _compression_kind(to_orc_compression(options.get_compression())), - _compression_blocksize(compression_block_size(_compression_kind)), + _compression{options.get_compression()}, + _compression_blocksize(compression_block_size(_compression)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), _sort_dictionaries{options.get_enable_dictionary_sort()}, @@ -2526,7 +2476,7 @@ void writer::impl::write(table_view const& input) _row_index_stride, _enable_dictionary, _sort_dictionaries, - _compression_kind, + _compression, _compression_blocksize, _stats_freq, _compression_statistics != nullptr, @@ -2613,7 +2563,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, rg_stats, &stripe, &streams, - _compression_kind, + _compression, _compression_blocksize, _out_sink); } @@ -2627,7 +2577,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, bounce_buffer, &stripe, &streams, - _compression_kind, + _compression, _out_sink, _stream)); } @@ -2645,10 +2595,10 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, : 0; if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } } - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(sf); stripe.footerLength = pbw.size(); - if (_compression_kind != NONE) { + if (_compression != compression_type::NONE) { uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; pbw.buffer()[0] = static_cast(uncomp_sf_len >> 0); pbw.buffer()[1] = static_cast(uncomp_sf_len >> 8); @@ -2780,21 +2730,21 @@ void writer::impl::close() // Write statistics metadata if (not _orc_meta.stripeStats.empty()) { - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(_orc_meta); - add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer()); + add_uncompressed_block_headers(_compression, _compression_blocksize, pbw.buffer()); ps.metadataLength = pbw.size(); _out_sink->host_write(pbw.data(), pbw.size()); } else { ps.metadataLength = 0; } - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(_footer); - add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer()); + add_uncompressed_block_headers(_compression, _compression_blocksize, pbw.buffer()); // Write postscript metadata ps.footerLength = pbw.size(); - ps.compression = _compression_kind; + ps.compression = to_orc_compression(_compression); ps.compressionBlockSize = _compression_blocksize; ps.version = {0, 12}; // Hive 0.12 ps.writerVersion = cudf_writer_version; diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index cae849ee315..7d23482cb17 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -342,7 +342,7 @@ class writer::impl { // Writer options. 
stripe_size_limits const _max_stripe_size; size_type const _row_index_stride; - CompressionKind const _compression_kind; + compression_type const _compression; size_t const _compression_blocksize; std::shared_ptr _compression_statistics; // Optional output statistics_freq const _stats_freq; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 77924ac0f35..1b67b53ae8e 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -23,8 +23,7 @@ #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "interop/decimal_conversion_utilities.cuh" -#include "io/comp/gpuinflate.hpp" -#include "io/comp/nvcomp_adapter.hpp" +#include "io/comp/comp.hpp" #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" @@ -67,6 +66,20 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; +Compression to_parquet_compression(compression_type compression) +{ + switch (compression) { + case compression_type::AUTO: + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::LZ4: + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + return Compression::LZ4_RAW; + case compression_type::NONE: return Compression::UNCOMPRESSED; + default: CUDF_FAIL("Unsupported compression type"); + } +} + struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, host_span const> kv_md, @@ -1172,7 +1185,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, size_t max_page_size_bytes, size_type max_page_size_rows, bool write_v2_headers, - Compression compression_codec, + compression_type compression, rmm::cuda_stream_view stream) { if (chunks.is_empty()) { return cudf::detail::hostdevice_vector{}; } @@ -1187,7 +1200,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1212,7 +1225,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1221,12 +1234,10 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Get per-page max compressed size cudf::detail::hostdevice_vector comp_page_sizes(num_pages, stream); - std::transform(page_sizes.begin(), - page_sizes.end(), - comp_page_sizes.begin(), - [compression_codec](auto page_size) { - return max_compression_output_size(compression_codec, page_size); - }); + std::transform( + page_sizes.begin(), page_sizes.end(), comp_page_sizes.begin(), [compression](auto page_size) { + return max_compressed_size(compression, page_size); + }); comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size @@ -1238,7 +1249,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1247,16 +1258,13 @@ auto init_page_sizes(hostdevice_2dvector& chunks, return comp_page_sizes; } -size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) +size_t max_page_bytes(compression_type compression, size_t 
max_page_size_bytes) { - if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == compression_type::NONE) { return max_page_size_bytes; } - auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) - ? std::nullopt - : nvcomp::compress_max_allowed_chunk_size(ncomp_type); + auto const comp_limit = compress_max_allowed_chunk_size(compression); - auto max_size = std::min(nvcomp_limit.value_or(max_page_size_bytes), max_page_size_bytes); + auto max_size = std::min(comp_limit.value_or(max_page_size_bytes), max_page_size_bytes); // page size must fit in a 32-bit signed integer return std::min(max_size, std::numeric_limits::max()); } @@ -1265,7 +1273,7 @@ std::pair>, std::vector& chunks, host_span col_desc, device_2dspan frags, - Compression compression, + compression_type compression, dictionary_policy dict_policy, size_t max_dict_size, rmm::cuda_stream_view stream) @@ -1404,7 +1412,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param num_columns Total number of columns * @param num_pages Total number of pages * @param num_stats_bfr Number of statistics buffers - * @param compression Compression format + * @param alignment Page alignment * @param max_page_size_bytes Maximum uncompressed page size, in bytes * @param max_page_size_rows Maximum page size, in rows * @param write_v2_headers True if version 2 page headers are to be written @@ -1419,7 +1427,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, uint32_t num_columns, uint32_t num_pages, uint32_t num_stats_bfr, - Compression compression, + size_t alignment, size_t max_page_size_bytes, size_type max_page_size_rows, bool write_v2_headers, @@ -1435,7 +1443,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression), + alignment, write_v2_headers, (num_stats_bfr) ? page_stats_mrg.data() : nullptr, (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, @@ -1478,7 +1486,7 @@ void encode_pages(hostdevice_2dvector& chunks, statistics_chunk const* chunk_stats, statistics_chunk const* column_stats, std::optional& comp_stats, - Compression compression, + compression_type compression, int32_t column_index_truncate_length, bool write_v2_headers, rmm::cuda_stream_view stream) @@ -1488,7 +1496,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats, num_pages) : device_span(); - uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? num_pages : 0; + uint32_t max_comp_pages = (compression != compression_type::NONE) ? 
num_pages : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1499,34 +1507,7 @@ void encode_pages(hostdevice_2dvector& chunks, compression_result{0, compression_status::FAILURE}); EncodePages(pages, write_v2_headers, comp_in, comp_out, comp_res, stream); - switch (compression) { - case Compression::SNAPPY: - if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { - gpu_snap(comp_in, comp_out, comp_res, stream); - } else { - nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } - break; - case Compression::ZSTD: { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); - break; - } - case Compression::LZ4_RAW: { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::LZ4); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::LZ4, comp_in, comp_out, comp_res, stream); - break; - } - case Compression::UNCOMPRESSED: break; - default: CUDF_FAIL("invalid compression type"); - } + compress(compression, comp_in, comp_out, comp_res, stream); // TBD: Not clear if the official spec actually allows dynamically turning off compression at the // chunk-level @@ -1744,7 +1725,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type max_page_size_rows, int32_t column_index_truncate_length, statistics_freq stats_granularity, - Compression compression, + compression_type compression, bool collect_compression_statistics, dictionary_policy dict_policy, size_t max_dictionary_size, @@ -2146,7 +2127,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == compression_type::NONE) { max_comp_bfr_size = 0; } // Initialize data pointers uint32_t const num_stats_bfr = @@ -2214,7 +2195,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, num_columns, num_pages, num_stats_bfr, - compression, + compress_required_chunk_alignment(compression), max_page_size_bytes, max_page_size_rows, write_v2_headers, @@ -2270,7 +2251,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const dev_bfr = ck.is_compressed ? 
ck.compressed_bfr : ck.uncompressed_bfr; auto& column_chunk_meta = row_group.columns[i].meta_data; - if (ck.is_compressed) { column_chunk_meta.codec = compression; } + if (ck.is_compressed) { column_chunk_meta.codec = to_parquet_compression(compression); } if (!out_sink[p]->is_device_write_preferred(ck.compressed_size)) { all_device_write = false; } @@ -2375,7 +2356,7 @@ writer::impl::impl(std::vector> sinks, single_write_mode mode, rmm::cuda_stream_view stream) : _stream(stream), - _compression(to_parquet_compression(options.get_compression())), + _compression(options.get_compression()), _max_row_group_size{options.get_row_group_size_bytes()}, _max_row_group_rows{options.get_row_group_size_rows()}, _max_page_size_bytes(max_page_bytes(_compression, options.get_max_page_size_bytes())), @@ -2406,7 +2387,7 @@ writer::impl::impl(std::vector> sinks, single_write_mode mode, rmm::cuda_stream_view stream) : _stream(stream), - _compression(to_parquet_compression(options.get_compression())), + _compression(options.get_compression()), _max_row_group_size{options.get_row_group_size_bytes()}, _max_row_group_rows{options.get_row_group_size_rows()}, _max_page_size_bytes(max_page_bytes(_compression, options.get_max_page_size_bytes())), diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 63128faf993..d5a5a534b93 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,7 +144,7 @@ class writer::impl { rmm::cuda_stream_view _stream; // Writer options. - Compression const _compression; + compression_type const _compression; size_t const _max_row_group_size; size_type const _max_row_group_rows; size_t const _max_page_size_bytes; diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index f15ea1f3c37..ede788c97c2 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,8 +21,6 @@ #include "writer_impl_helpers.hpp" -#include "io/comp/nvcomp_adapter.hpp" - #include #include #include @@ -32,48 +30,6 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; -Compression to_parquet_compression(compression_type compression) -{ - switch (compression) { - case compression_type::AUTO: - case compression_type::SNAPPY: return Compression::SNAPPY; - case compression_type::ZSTD: return Compression::ZSTD; - case compression_type::LZ4: - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - return Compression::LZ4_RAW; - case compression_type::NONE: return Compression::UNCOMPRESSED; - default: CUDF_FAIL("Unsupported compression type"); - } -} - -nvcomp::compression_type to_nvcomp_compression_type(Compression codec) -{ - switch (codec) { - case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY; - case Compression::ZSTD: return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4; - default: CUDF_FAIL("Unsupported compression type"); - } -} - -uint32_t page_alignment(Compression codec) -{ - if (codec == Compression::UNCOMPRESSED or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { - return 1u; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(codec)); -} - -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) -{ - if (codec == Compression::UNCOMPRESSED) return 0; - - return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); -} - void fill_table_meta(table_input_metadata& table_meta) { // Fill unnamed columns' names in table_meta diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 14a9a0ed5b7..b5c73c348fe 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,48 +20,12 @@ */ #pragma once -#include "parquet_common.hpp" #include #include -#include namespace cudf::io::parquet::detail { -/** - * @brief Function that translates GDF compression to parquet compression. - * - * @param compression The compression type - * @return The supported Parquet compression - */ -Compression to_parquet_compression(compression_type compression); - -/** - * @brief Function that translates the given compression codec to nvcomp compression type. - * - * @param codec Compression codec - * @return Translated nvcomp compression type - */ -cudf::io::detail::nvcomp::compression_type to_nvcomp_compression_type(Compression codec); - -/** - * @brief Function that computes input alignment requirements for the given compression type. - * - * @param codec Compression codec - * @return Required alignment - */ -uint32_t page_alignment(Compression codec); - -/** - * @brief Gets the maximum compressed chunk size for the largest chunk uncompressed chunk in the - * batch. 
- * - * @param codec Compression codec - * @param compression_blocksize Size of the largest uncompressed chunk in the batch - * @return Maximum compressed chunk size - */ -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); - /** * @brief Fill the table metadata with default column names. * diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 2209a30149d..708c2045a74 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -2068,6 +2068,7 @@ TEST_P(OrcCompressionTest, Basic) INSTANTIATE_TEST_CASE_P(OrcCompressionTest, OrcCompressionTest, ::testing::Values(cudf::io::compression_type::NONE, + cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, cudf::io::compression_type::ZSTD)); diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index d66f685cd9c..419ac909ac6 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -268,6 +268,7 @@ TEST_P(ParquetCompressionTest, Basic) INSTANTIATE_TEST_CASE_P(ParquetCompressionTest, ParquetCompressionTest, ::testing::Values(cudf::io::compression_type::NONE, + cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, cudf::io::compression_type::ZSTD)); From 478ec50edf302a338db043039abad6a2560144ea Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 13 Jan 2025 15:19:44 -0600 Subject: [PATCH 09/12] Precompute AST arity (#17234) This PR precomputes AST arity on the host, to reduce the complexity in device-side arity lookup. Authors: - Bradley Dice (https://github.com/bdice) - Basit Ayantunde (https://github.com/lamarrr) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17234 --- cpp/CMakeLists.txt | 1 + .../cudf/ast/detail/expression_evaluator.cuh | 4 +- .../cudf/ast/detail/expression_parser.hpp | 50 ++- cpp/include/cudf/ast/detail/operators.hpp | 418 +++--------------- cpp/src/ast/expression_parser.cpp | 3 +- cpp/src/ast/operators.cpp | 293 ++++++++++++ 6 files changed, 391 insertions(+), 378 deletions(-) create mode 100644 cpp/src/ast/operators.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 252cc7897d8..4d83cbd907c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -336,6 +336,7 @@ add_library( src/aggregation/result_cache.cpp src/ast/expression_parser.cpp src/ast/expressions.cpp + src/ast/operators.cpp src/binaryop/binaryop.cpp src/binaryop/compiled/ATan2.cu src/binaryop/compiled/Add.cu diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index 9d8762555d7..001b604814c 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -452,7 +452,7 @@ struct expression_evaluator { ++operator_index) { // Execute operator auto const op = plan.operators[operator_index]; - auto const arity = ast_operator_arity(op); + auto const arity = plan.operator_arities[operator_index]; if (arity == 1) { // Unary operator auto const& input = diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index b5973d0ace9..d2e8c1cd41f 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -88,6 +89,7 @@ struct expression_device_view { device_span data_references; device_span literals; device_span operators; + device_span operator_arities; device_span operator_source_indices; cudf::size_type num_intermediates; }; @@ -229,39 +231,55 @@ class expression_parser { * @param[in] v The `std::vector` containing components (operators, literals, etc). * @param[in,out] sizes The `std::vector` containing the size of each data buffer. * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. + * @param[in,out] alignment The maximum alignment needed for all the extracted size and pointers */ template void extract_size_and_pointer(std::vector const& v, std::vector& sizes, - std::vector& data_pointers) + std::vector& data_pointers, + cudf::size_type& alignment) { + // sub-type alignment will only work provided the alignment is lesser or equal to + // alignof(max_align_t) which is the maximum alignment provided by rmm's device buffers + static_assert(alignof(T) <= alignof(max_align_t)); auto const data_size = sizeof(T) * v.size(); sizes.push_back(data_size); data_pointers.push_back(v.data()); + alignment = std::max(alignment, static_cast(alignof(T))); } void move_to_device(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { std::vector sizes; std::vector data_pointers; + // use a minimum of 4-byte alignment + cudf::size_type buffer_alignment = 4; - extract_size_and_pointer(_data_references, sizes, data_pointers); - extract_size_and_pointer(_literals, sizes, data_pointers); - extract_size_and_pointer(_operators, sizes, data_pointers); - extract_size_and_pointer(_operator_source_indices, sizes, data_pointers); + extract_size_and_pointer(_data_references, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_literals, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operators, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operator_arities, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operator_source_indices, sizes, data_pointers, buffer_alignment); // Create device buffer - auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); - auto buffer_offsets = std::vector(sizes.size()); - thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); + auto buffer_offsets = std::vector(sizes.size()); + thrust::exclusive_scan(sizes.cbegin(), + sizes.cend(), + buffer_offsets.begin(), + cudf::size_type{0}, + [buffer_alignment](auto a, 
auto b) { + // align each component of the AST program + return cudf::util::round_up_safe(a + b, buffer_alignment); + }); + + auto const buffer_size = buffer_offsets.empty() ? 0 : (buffer_offsets.back() + sizes.back()); + auto host_data_buffer = std::vector(buffer_size); - auto h_data_buffer = std::vector(buffer_size); for (unsigned int i = 0; i < data_pointers.size(); ++i) { - std::memcpy(h_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); + std::memcpy(host_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); } - _device_data_buffer = rmm::device_buffer(h_data_buffer.data(), buffer_size, stream, mr); - + _device_data_buffer = rmm::device_buffer(host_data_buffer.data(), buffer_size, stream, mr); stream.synchronize(); // Create device pointers to components of plan @@ -277,8 +295,11 @@ class expression_parser { device_expression_data.operators = device_span( reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), _operators.size()); - device_expression_data.operator_source_indices = device_span( + device_expression_data.operator_arities = device_span( reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + _operators.size()); + device_expression_data.operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[4]), _operator_source_indices.size()); device_expression_data.num_intermediates = _intermediate_counter.get_max_used(); shmem_per_thread = static_cast( @@ -322,6 +343,7 @@ class expression_parser { bool _has_nulls; std::vector _data_references; std::vector _operators; + std::vector _operator_arities; std::vector _operator_source_indices; std::vector _literals; }; diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 46507700e21..db04e1fe989 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,159 +69,111 @@ constexpr bool is_valid_unary_op = cuda::std::is_invocable_v; * @param args Forwarded arguments to `operator()` of `f`. */ template -CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, F&& f, Ts&&... args) +CUDF_HOST_DEVICE inline constexpr decltype(auto) ast_operator_dispatcher(ast_operator op, + F&& f, + Ts&&... 
args) { switch (op) { case ast_operator::ADD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SUB: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::MUL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TRUE_DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::FLOOR_DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::MOD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::PYMOD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::POW: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NOT_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LESS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::GREATER: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LESS_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::GREATER_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_XOR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOGICAL_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_LOGICAL_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOGICAL_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_LOGICAL_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::IDENTITY: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::IS_NULL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case 
ast_operator::SIN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::COS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TAN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCSIN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCCOS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCTAN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SINH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::COSH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TANH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCSINH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCCOSH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCTANH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::EXP: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOG: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SQRT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CBRT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CEIL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::FLOOR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ABS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::RINT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BIT_INVERT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NOT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_INT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_UINT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_FLOAT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid operator."); @@ -955,231 +907,6 @@ 
struct operator_functor { } }; -/** - * @brief Functor used to single-type-dispatch binary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_binary_op` trait. This function assumes that both inputs are - * the same type, and dispatches based on the type of the left input. - * - * @tparam OperatorFunctor Binary operator functor. - */ -template -struct single_dispatch_binary_operator_types { - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation."); -#else - CUDF_UNREACHABLE("Invalid binary operation."); -#endif - } -}; - -/** - * @brief Functor performing a type dispatch for a binary operator. - * - * This functor performs single dispatch, which assumes lhs_type == rhs_type. This may not be true - * for all binary operators but holds for all currently implemented operators. - */ -struct type_dispatch_binary_op { - /** - * @brief Performs type dispatch for a binary operator. - * - * @tparam op AST operator. - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param lhs_type Type of left input data. - * @param rhs_type Type of right input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ - template - CUDF_HOST_DEVICE inline void operator()(cudf::data_type lhs_type, - cudf::data_type rhs_type, - F&& f, - Ts&&... args) - { - // Single dispatch (assume lhs_type == rhs_type) - type_dispatcher( - lhs_type, - // Always dispatch to the non-null operator for the purpose of type determination. - detail::single_dispatch_binary_operator_types>{}, - std::forward(f), - std::forward(args)...); - } -}; - -/** - * @brief Dispatches a runtime binary operator to a templated type dispatcher. - * - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param lhs_type Type of left input data. - * @param rhs_type Type of right input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ -template -CUDF_HOST_DEVICE inline constexpr void binary_operator_dispatcher( - ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) -{ - ast_operator_dispatcher(op, - detail::type_dispatch_binary_op{}, - lhs_type, - rhs_type, - std::forward(f), - std::forward(args)...); -} - -/** - * @brief Functor used to type-dispatch unary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_unary_op` trait. - * - * @tparam OperatorFunctor Unary operator functor. - */ -template -struct dispatch_unary_operator_types { - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid unary operation."); -#else - CUDF_UNREACHABLE("Invalid unary operation."); -#endif - } -}; - -/** - * @brief Functor performing a type dispatch for a unary operator. 
- */ -struct type_dispatch_unary_op { - template - CUDF_HOST_DEVICE inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args) - { - type_dispatcher( - input_type, - // Always dispatch to the non-null operator for the purpose of type determination. - detail::dispatch_unary_operator_types>{}, - std::forward(f), - std::forward(args)...); - } -}; - -/** - * @brief Dispatches a runtime unary operator to a templated type dispatcher. - * - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param input_type Type of input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ -template -CUDF_HOST_DEVICE inline constexpr void unary_operator_dispatcher(ast_operator op, - cudf::data_type input_type, - F&& f, - Ts&&... args) -{ - ast_operator_dispatcher(op, - detail::type_dispatch_unary_op{}, - input_type, - std::forward(f), - std::forward(args)...); -} - -/** - * @brief Functor to determine the return type of an operator from its input types. - */ -struct return_type_functor { - /** - * @brief Callable for binary operators to determine return type. - * - * @tparam OperatorFunctor Operator functor to perform. - * @tparam LHS Left input type. - * @tparam RHS Right input type. - * @param result Reference whose value is assigned to the result data type. - */ - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { - using Out = cuda::std::invoke_result_t; - result = cudf::data_type(cudf::type_to_id()); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation. Return type cannot be determined."); -#else - CUDF_UNREACHABLE("Invalid binary operation. Return type cannot be determined."); -#endif - } - - /** - * @brief Callable for unary operators to determine return type. - * - * @tparam OperatorFunctor Operator functor to perform. - * @tparam T Input type. - * @param result Pointer whose value is assigned to the result data type. - */ - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { - using Out = cuda::std::invoke_result_t; - result = cudf::data_type(cudf::type_to_id()); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); -#else - CUDF_UNREACHABLE("Invalid unary operation. Return type cannot be determined."); -#endif - } -}; - /** * @brief Gets the return type of an AST operator. * @@ -1187,34 +914,8 @@ struct return_type_functor { * @param operand_types Vector of input types to the operator. * @return cudf::data_type Return type of the operator. */ -inline cudf::data_type ast_operator_return_type(ast_operator op, - std::vector const& operand_types) -{ - auto result = cudf::data_type(cudf::type_id::EMPTY); - switch (operand_types.size()) { - case 1: - unary_operator_dispatcher(op, operand_types[0], detail::return_type_functor{}, result); - break; - case 2: - binary_operator_dispatcher( - op, operand_types[0], operand_types[1], detail::return_type_functor{}, result); - break; - default: CUDF_FAIL("Unsupported operator return type."); break; - } - return result; -} - -/** - * @brief Functor to determine the arity (number of operands) of an operator. 
- */ -struct arity_functor { - template - CUDF_HOST_DEVICE inline void operator()(cudf::size_type& result) - { - // Arity is not dependent on null handling, so just use the false implementation here. - result = operator_functor::arity; - } -}; +cudf::data_type ast_operator_return_type(ast_operator op, + std::vector const& operand_types); /** * @brief Gets the arity (number of operands) of an AST operator. @@ -1222,12 +923,7 @@ struct arity_functor { * @param op Operator used to determine arity. * @return Arity of the operator. */ -CUDF_HOST_DEVICE inline cudf::size_type ast_operator_arity(ast_operator op) -{ - auto result = cudf::size_type(0); - ast_operator_dispatcher(op, detail::arity_functor{}, result); - return result; -} +cudf::size_type ast_operator_arity(ast_operator op); } // namespace detail diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp index d0e4c59ca54..b2cc134d9fa 100644 --- a/cpp/src/ast/expression_parser.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -161,6 +161,7 @@ cudf::size_type expression_parser::visit(operation const& expr) auto const op = expr.get_operator(); auto const data_type = cudf::ast::detail::ast_operator_return_type(op, operand_types); _operators.push_back(op); + _operator_arities.push_back(cudf::ast::detail::ast_operator_arity(op)); // Push data reference auto const output = [&]() { if (expression_index == 0) { diff --git a/cpp/src/ast/operators.cpp b/cpp/src/ast/operators.cpp new file mode 100644 index 00000000000..b60a69a42d9 --- /dev/null +++ b/cpp/src/ast/operators.cpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace ast { +namespace detail { +namespace { + +struct arity_functor { + template + void operator()(cudf::size_type& result) + { + // Arity is not dependent on null handling, so just use the false implementation here. + result = operator_functor::arity; + } +}; + +/** + * @brief Functor to determine the return type of an operator from its input types. + */ +struct return_type_functor { + /** + * @brief Callable for binary operators to determine return type. + * + * @tparam OperatorFunctor Operator functor to perform. + * @tparam LHS Left input type. + * @tparam RHS Right input type. + * @param result Pointer whose value is assigned to the result data type. + */ + template >* = nullptr> + void operator()(cudf::data_type& result) + { + using Out = cuda::std::invoke_result_t; + result = cudf::data_type{cudf::type_to_id()}; + } + + template >* = nullptr> + void operator()(cudf::data_type& result) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid binary operation. 
Return type cannot be determined."); +#else + CUDF_UNREACHABLE("Invalid binary operation. Return type cannot be determined."); +#endif + result = cudf::data_type{cudf::type_id::EMPTY}; + } + + /** + * @brief Callable for unary operators to determine return type. + * + * @tparam OperatorFunctor Operator functor to perform. + * @tparam T Input type. + * @param result Pointer whose value is assigned to the result data type. + */ + template >* = nullptr> + void operator()(cudf::data_type& result) + { + using Out = cuda::std::invoke_result_t; + result = cudf::data_type{cudf::type_to_id()}; + } + + template >* = nullptr> + void operator()(cudf::data_type& result) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); +#else + CUDF_UNREACHABLE("Invalid unary operation. Return type cannot be determined."); +#endif + result = cudf::data_type{cudf::type_id::EMPTY}; + } +}; + +/** + * @brief Functor used to single-type-dispatch binary operators. + * + * This functor's `operator()` is templated to validate calls to its operators based on the input + * type, as determined by the `is_valid_binary_op` trait. This function assumes that both inputs are + * the same type, and dispatches based on the type of the left input. + * + * @tparam OperatorFunctor Binary operator functor. + */ +template +struct single_dispatch_binary_operator_types { + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { + f.template operator()(std::forward(args)...); + } + + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid binary operation."); +#else + CUDF_UNREACHABLE("Invalid binary operation."); +#endif + } +}; + +/** + * @brief Functor performing a type dispatch for a binary operator. + * + * This functor performs single dispatch, which assumes lhs_type == rhs_type. This may not be true + * for all binary operators but holds for all currently implemented operators. + */ +struct type_dispatch_binary_op { + /** + * @brief Performs type dispatch for a binary operator. + * + * @tparam op AST operator. + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param lhs_type Type of left input data. + * @param rhs_type Type of right input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ + template + inline void operator()(cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) + { + // Single dispatch (assume lhs_type == rhs_type) + type_dispatcher( + lhs_type, + // Always dispatch to the non-null operator for the purpose of type determination. + detail::single_dispatch_binary_operator_types>{}, + std::forward(f), + std::forward(args)...); + } +}; + +/** + * @brief Dispatches a runtime binary operator to a templated type dispatcher. + * + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param lhs_type Type of left input data. + * @param rhs_type Type of right input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ +template +inline constexpr void binary_operator_dispatcher( + ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... 
args) +{ + ast_operator_dispatcher(op, + detail::type_dispatch_binary_op{}, + lhs_type, + rhs_type, + std::forward(f), + std::forward(args)...); +} + +/** + * @brief Functor used to type-dispatch unary operators. + * + * This functor's `operator()` is templated to validate calls to its operators based on the input + * type, as determined by the `is_valid_unary_op` trait. + * + * @tparam OperatorFunctor Unary operator functor. + */ +template +struct dispatch_unary_operator_types { + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { + f.template operator()(std::forward(args)...); + } + + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid unary operation."); +#else + CUDF_UNREACHABLE("Invalid unary operation."); +#endif + } +}; + +/** + * @brief Functor performing a type dispatch for a unary operator. + */ +struct type_dispatch_unary_op { + template + inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args) + { + type_dispatcher( + input_type, + // Always dispatch to the non-null operator for the purpose of type determination. + detail::dispatch_unary_operator_types>{}, + std::forward(f), + std::forward(args)...); + } +}; + +/** + * @brief Dispatches a runtime unary operator to a templated type dispatcher. + * + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param input_type Type of input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ +template +inline constexpr void unary_operator_dispatcher(ast_operator op, + cudf::data_type input_type, + F&& f, + Ts&&... args) +{ + ast_operator_dispatcher(op, + detail::type_dispatch_unary_op{}, + input_type, + std::forward(f), + std::forward(args)...); +} + +} // namespace + +cudf::data_type ast_operator_return_type(ast_operator op, + std::vector const& operand_types) +{ + cudf::data_type result{cudf::type_id::EMPTY}; + switch (operand_types.size()) { + case 1: + unary_operator_dispatcher(op, operand_types[0], detail::return_type_functor{}, result); + break; + case 2: + binary_operator_dispatcher( + op, operand_types[0], operand_types[1], detail::return_type_functor{}, result); + break; + default: CUDF_FAIL("Unsupported operator return type."); break; + } + return result; +} + +cudf::size_type ast_operator_arity(ast_operator op) +{ + cudf::size_type result{}; + ast_operator_dispatcher(op, arity_functor{}, result); + return result; +} + +} // namespace detail + +} // namespace ast + +} // namespace cudf From f84cd4316eaa61e231b5fd096608ca09d5e3c08c Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:26:43 -0500 Subject: [PATCH 10/12] [BUG] xfail Polars excel test (#17731) One the Polars tests fails when `fastexcel>=0.12.1`. I opened https://github.com/pola-rs/polars/issues/20698 to track that failing test. This PR xfail that test for now. 
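For context, the gating used here is the standard pytest conditional-xfail marker; a minimal standalone sketch of the same pattern (the decorator name and test body below are illustrative only, not part of this patch):

```python
# Hypothetical, self-contained illustration of a version-gated xfail.
import fastexcel
import pytest
from packaging import version

xfail_new_fastexcel = pytest.mark.xfail(
    condition=version.parse(fastexcel.__version__) >= version.parse("0.12.1"),
    reason="Tracked upstream: https://github.com/pola-rs/polars/issues/20698",
)


@xfail_new_fastexcel
def test_write_excel_bytes():
    # Placeholder body; the real test lives in the Polars test suite.
    ...
```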
xref #17677 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17731 --- python/cudf_polars/cudf_polars/testing/plugin.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index c16df320ceb..e453a8b89b9 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -8,7 +8,9 @@ from functools import partialmethod from typing import TYPE_CHECKING +import fastexcel import pytest +from packaging import version import polars @@ -44,7 +46,7 @@ def pytest_configure(config: pytest.Config) -> None: ) -EXPECTED_FAILURES: Mapping[str, str] = { +EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = { "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning", @@ -192,6 +194,10 @@ def pytest_configure(config: pytest.Config) -> None: # Maybe flaky, order-dependent? "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", + "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": ( + "Fails when fastexcel version >= 0.12.1. tracking issue: https://github.com/pola-rs/polars/issues/20698", + version.parse(fastexcel.__version__) >= version.parse("0.12.1"), + ), } @@ -219,4 +225,12 @@ def pytest_collection_modifyitems( if item.nodeid in TESTS_TO_SKIP: item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid])) elif item.nodeid in EXPECTED_FAILURES: + if isinstance(EXPECTED_FAILURES[item.nodeid], tuple): + # the second entry in the tuple is the condition to xfail on + item.add_marker( + pytest.mark.xfail( + condition=EXPECTED_FAILURES[item.nodeid][1], + reason=EXPECTED_FAILURES[item.nodeid][0], + ), + ) item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) From 253fb2f10e921519502e562672d29029e844c2cf Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:41:47 -0800 Subject: [PATCH 11/12] Require to implement `AutoCloseable` for the classes derived from `HostUDFWrapper` (#17727) This adds the requirement to implement `AutoCloseable` to the classes derived from `HostUDFWrapper`, forcing them to delete the native UDF instance upon class destruction. Doing so will fix the memory leak issue when the native UDF instance never being destroyed. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/17727 --- java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java index 0b6ecf2e140..124f2c99188 100644 --- a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java +++ b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java @@ -24,8 +24,10 @@ *

* A new host UDF aggregation implementation must extend this class and override the * {@code hashCode} and {@code equals} methods for such purposes. + * In addition, since this class implements {@code AutoCloseable}, the {@code close} method must + * also be overridden to automatically delete the native UDF instance upon class destruction. */ -public abstract class HostUDFWrapper { +public abstract class HostUDFWrapper implements AutoCloseable { public final long udfNativeHandle; public HostUDFWrapper(long udfNativeHandle) { From 847029172ce47ef2109dc825f149ab58130a2fc6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Jan 2025 09:18:13 -0600 Subject: [PATCH 12/12] convert all nulls to nans in a specific scenario (#17677) Fixes: #17666 This PR ensures we convert all nulls to nan's in float columns only in pandas compatibility mode. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17677 --- python/cudf/cudf/core/column/column.py | 4 ++++ python/cudf/cudf/tests/test_series.py | 11 ++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 30da8727366..19f2802553d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2413,7 +2413,11 @@ def as_column( and pa.types.is_integer(arbitrary.type) and arbitrary.null_count > 0 ): + # TODO: Need to re-visit this cast and fill_null + # calls while addressing the following issue: + # https://github.com/rapidsai/cudf/issues/14149 arbitrary = arbitrary.cast(pa.float64()) + arbitrary = pc.fill_null(arbitrary, np.nan) if ( cudf.get_option("default_integer_bitwidth") and pa.types.is_integer(arbitrary.type) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f8697c5c6b8..891c0ede9a4 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import datetime import decimal import hashlib @@ -3003,3 +3003,12 @@ def test_dtype_dtypes_equal(): ser = cudf.Series([0]) assert ser.dtype is ser.dtypes assert ser.dtypes is ser.to_pandas().dtypes + + +def test_null_like_to_nan_pandas_compat(): + with cudf.option_context("mode.pandas_compatible", True): + ser = cudf.Series([1, 2, np.nan, 10, None]) + pser = pd.Series([1, 2, np.nan, 10, None]) + + assert pser.dtype == ser.dtype + assert_eq(ser, pser)
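
As a usage note on this last change, a minimal sketch of the intended behavior (assuming a cudf build that includes this patch; only integer data containing nulls is affected):

```python
# Sketch of the behavior described above; mirrors the new test rather than any new API.
import cudf
import numpy as np

data = [1, 2, None, 10]

# Default mode: nulls stay as nulls and the column keeps its int64 dtype.
s_default = cudf.Series(data)
print(s_default.dtype)  # int64, with a null mask

# pandas compatibility mode: nulls in integer data are promoted to NaN in a
# float64 column, matching what pandas.Series(data) would produce.
with cudf.option_context("mode.pandas_compatible", True):
    s_compat = cudf.Series(data)
print(s_compat.dtype)                # float64
print(s_compat.to_pandas().isna())   # True only at the position of the original null
```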