From 231015910bae375077e07c01d2bf70697182ccad Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 9 Jan 2025 16:53:25 -0500 Subject: [PATCH 01/12] Support multithreaded reading of compressed buffers in JSON reader (#17670) Addresses #17638 This PR introduces multithreaded host-side decompression of compressed input buffers passed to the JSON reader, and uses a stream pool to transfer the uncompressed buffers to device. Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17670 --- cpp/src/io/json/read_json.cu | 71 +++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 82d8152ca1c..113342e9cbf 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -37,12 +38,25 @@ #include #include +#include +#include + #include namespace cudf::io::json::detail { namespace { +namespace pools { + +BS::thread_pool& tpool() +{ + static BS::thread_pool _tpool(std::thread::hardware_concurrency()); + return _tpool; +} + +} // namespace pools + class compressed_host_buffer_source final : public datasource { public: explicit compressed_host_buffer_source(std::unique_ptr const& src, @@ -51,8 +65,8 @@ class compressed_host_buffer_source final : public datasource { { auto ch_buffer = host_span(reinterpret_cast(_dbuf_ptr->data()), _dbuf_ptr->size()); - if (comptype == compression_type::GZIP || comptype == compression_type::ZIP || - comptype == compression_type::SNAPPY) { + if (_comptype == compression_type::GZIP || _comptype == compression_type::ZIP || + _comptype == compression_type::SNAPPY) { _decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer); } else { _decompressed_buffer = cudf::io::detail::decompress(_comptype, ch_buffer); @@ -96,7 +110,22 @@ class compressed_host_buffer_source final : public datasource { return std::make_unique(_decompressed_buffer.data() + offset, count); } - [[nodiscard]] bool supports_device_read() const override { return false; } + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + auto& thread_pool = pools::tpool(); + return thread_pool.submit_task([this, offset, size, dst, stream] { + auto hbuf = host_read(offset, size); + CUDF_CUDA_TRY( + cudaMemcpyAsync(dst, hbuf->data(), hbuf->size(), cudaMemcpyHostToDevice, stream.value())); + stream.synchronize(); + return hbuf->size(); + }); + } + + [[nodiscard]] bool supports_device_read() const override { return true; } [[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; } @@ -431,6 +460,8 @@ device_span ingest_raw_input(device_span buffer, // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line // delimiter. 
auto constexpr num_delimiter_chars = 1; + std::vector> thread_tasks; + auto stream_pool = cudf::detail::fork_streams(stream, pools::tpool().get_thread_count()); auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); @@ -447,13 +478,17 @@ device_span ingest_raw_input(device_span buffer, auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset); range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0; - for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) { + for (std::size_t i = start_source, cur_stream = 0; + i < sources.size() && bytes_read < total_bytes_to_read; + i++) { if (sources[i]->is_empty()) continue; auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); auto destination = reinterpret_cast(buffer.data()) + bytes_read + (num_delimiter_chars * delimiter_map.size()); - if (sources[i]->is_device_read_preferred(data_size)) { - bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); + if (sources[i]->supports_device_read()) { + thread_tasks.emplace_back(sources[i]->device_read_async( + range_offset, data_size, destination, stream_pool[cur_stream++ % stream_pool.size()])); + bytes_read += data_size; } else { h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); auto const& h_buffer = h_buffers.back(); @@ -481,6 +516,15 @@ device_span ingest_raw_input(device_span buffer, buffer.data()); } stream.synchronize(); + + if (thread_tasks.size()) { + auto const bytes_read = std::accumulate( + thread_tasks.begin(), thread_tasks.end(), std::size_t{0}, [](std::size_t sum, auto& task) { + return sum + task.get(); + }); + CUDF_EXPECTS(bytes_read == total_bytes_to_read, "something's fishy"); + } + return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); } @@ -505,10 +549,17 @@ table_with_metadata read_json(host_span> sources, return read_json_impl(sources, reader_opts, stream, mr); std::vector> compressed_sources; - for (size_t i = 0; i < sources.size(); i++) { - compressed_sources.emplace_back( - std::make_unique(sources[i], reader_opts.get_compression())); + std::vector>> thread_tasks; + auto& thread_pool = pools::tpool(); + for (auto& src : sources) { + thread_tasks.emplace_back(thread_pool.submit_task([&reader_opts, &src] { + return std::make_unique(src, reader_opts.get_compression()); + })); } + std::transform(thread_tasks.begin(), + thread_tasks.end(), + std::back_inserter(compressed_sources), + [](auto& task) { return task.get(); }); // in read_json_impl, we need the compressed source size to actually be the // uncompressed source size for correct batching return read_json_impl(compressed_sources, reader_opts, stream, mr); From a8a41975b0c1cfaedb7d4461ee027f6f9ff75b0e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:16:04 -0800 Subject: [PATCH 02/12] Remove cudf._libs.types.pyx (#17665) Contributes to https://github.com/rapidsai/cudf/issues/17317 1. Moves some Python routines/objects to `cudf/utils/dtypes.py` 2. 
Moves specific column only routines directly to `cudf/_libs/column.pyx` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17665 --- python/cudf/cudf/_lib/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/column.pxd | 4 +- python/cudf/cudf/_lib/column.pyx | 100 ++++++++-- python/cudf/cudf/_lib/scalar.pyx | 49 ++--- python/cudf/cudf/_lib/types.pxd | 11 -- python/cudf/cudf/_lib/types.pyx | 172 ------------------ python/cudf/cudf/core/_base_index.py | 9 +- .../cudf/cudf/core/_internals/aggregation.py | 4 +- python/cudf/cudf/core/_internals/binaryop.py | 4 +- python/cudf/cudf/core/_internals/unary.py | 4 +- python/cudf/cudf/core/column/categorical.py | 10 +- python/cudf/cudf/core/column/column.py | 27 +-- python/cudf/cudf/core/column/lists.py | 4 +- python/cudf/cudf/core/column/string.py | 12 +- python/cudf/cudf/core/copy_types.py | 6 +- python/cudf/cudf/core/dtypes.py | 5 +- python/cudf/cudf/core/groupby/groupby.py | 21 +-- python/cudf/cudf/core/index.py | 6 +- python/cudf/cudf/core/indexed_frame.py | 3 +- python/cudf/cudf/core/join/join.py | 6 +- python/cudf/cudf/core/multiindex.py | 11 +- python/cudf/cudf/core/reshape.py | 9 +- python/cudf/cudf/io/csv.py | 8 +- python/cudf/cudf/io/json.py | 8 +- python/cudf/cudf/io/orc.py | 4 +- python/cudf/cudf/utils/dtypes.py | 66 ++++++- 26 files changed, 251 insertions(+), 316 deletions(-) delete mode 100644 python/cudf/cudf/_lib/types.pxd delete mode 100644 python/cudf/cudf/_lib/types.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index ff6fba1c3e8..ec44a6aa8c5 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx) +set(cython_sources column.pyx scalar.pyx strings_udf.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8b1d16f0d85..026c12895e8 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from typing import Literal @@ -13,6 +13,8 @@ from pylibcudf.libcudf.column.column_view cimport ( from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer +cdef dtype_from_lists_column_view(column_view cv) +cdef dtype_from_column_view(column_view cv) cdef class Column: cdef public: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f7dcd89ea48..c59bbc0f40c 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from typing import Literal @@ -19,24 +19,21 @@ from cudf.core.buffer import ( as_buffer, cuda_array_interface_wrapper, ) -from cudf.utils.dtypes import _get_base_dtype +from cudf.utils.dtypes import ( + _get_base_dtype, + dtype_to_pylibcudf_type, + PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES, +) from cpython.buffer cimport PyObject_CheckBuffer -from libc.stdint cimport uintptr_t -from libcpp.memory cimport make_unique, unique_ptr +from libc.stdint cimport uintptr_t, int32_t +from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector from rmm.pylibrmm.device_buffer cimport DeviceBuffer -from cudf._lib.types cimport ( - dtype_from_column_view, - dtype_to_pylibcudf_type, -) - -from cudf._lib.types import dtype_from_pylibcudf_column - -from pylibcudf cimport DataType as plc_DataType +from pylibcudf cimport DataType as plc_DataType, Column as plc_Column cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary @@ -45,6 +42,7 @@ from pylibcudf.libcudf.column.column_factories cimport ( make_numeric_column ) from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count from pylibcudf.libcudf.scalar.scalar cimport scalar @@ -64,6 +62,80 @@ cdef get_element(column_view col_view, size_type index): ) +def dtype_from_pylibcudf_column(plc_Column col not None): + type_ = col.type() + tid = type_.id() + + if tid == pylibcudf.TypeId.LIST: + child = col.list_view().child() + return cudf.ListDtype(dtype_from_pylibcudf_column(child)) + elif tid == pylibcudf.TypeId.STRUCT: + fields = { + str(i): dtype_from_pylibcudf_column(col.child(i)) + for i in range(col.num_children()) + } + return cudf.StructDtype(fields) + elif tid == pylibcudf.TypeId.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL32: + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] + + +cdef dtype_from_lists_column_view(column_view cv): + # lists_column_view have no default constructor, so we heap + # allocate it to get around Cython's limitation of requiring + # default constructors for stack allocated objects + cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) + cdef column_view child = lv.get()[0].child() + + if child.type().id() == libcudf_types.type_id.LIST: + return cudf.ListDtype(dtype_from_lists_column_view(child)) + else: + return cudf.ListDtype(dtype_from_column_view(child)) + + +cdef dtype_from_column_view(column_view cv): + cdef libcudf_types.type_id tid = cv.type().id() + if tid == libcudf_types.type_id.LIST: + return dtype_from_lists_column_view(cv) + elif tid == libcudf_types.type_id.STRUCT: + fields = { + str(i): dtype_from_column_view(cv.child(i)) + for i in range(cv.num_children()) + } + return cudf.StructDtype(fields) + elif tid == libcudf_types.type_id.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL32: + return cudf.Decimal32Dtype( + 
precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[(tid)] + + cdef class Column: """ A Column stores columnar data in device memory. @@ -361,7 +433,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[mutable_column_view] children cdef void* data @@ -424,7 +496,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index fd6d0257940..65607c91302 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import copy @@ -14,17 +14,16 @@ import pylibcudf as plc import cudf from cudf.core.dtypes import ListDtype, StructDtype -from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf.core.missing import NA, NaT +from cudf.utils.dtypes import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES # We currently need this cimport because some of the implementations here # access the c_obj of the scalar, and because we need to be able to call # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
-from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID -from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar +from pylibcudf cimport Scalar as plc_Scalar +from pylibcudf.libcudf.scalar.scalar cimport scalar def _replace_nested(obj, check, replacement): @@ -223,40 +222,22 @@ cdef class DeviceScalar: return s cdef void _set_dtype(self, dtype=None): - cdef plc_TypeID cdtype_id = self.c_value.type().id() + cdtype_id = self.c_value.type().id() if dtype is not None: self._dtype = dtype elif cdtype_id in { - plc_TypeID.DECIMAL32, - plc_TypeID.DECIMAL64, - plc_TypeID.DECIMAL128, + plc.TypeID.DECIMAL32, + plc.TypeID.DECIMAL64, + plc.TypeID.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - elif cdtype_id == plc_TypeID.STRUCT: - struct_table_view = (self.get_raw_ptr())[0].view() - self._dtype = StructDtype({ - str(i): dtype_from_column_view(struct_table_view.column(i)) - for i in range(struct_table_view.num_columns()) - }) - elif cdtype_id == plc_TypeID.LIST: - if ( - self.get_raw_ptr() - )[0].view().type().id() == plc_TypeID.LIST: - self._dtype = dtype_from_column_view( - (self.get_raw_ptr())[0].view() - ) - else: - self._dtype = ListDtype( - PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - ( - (self.get_raw_ptr())[0] - .view().type().id() - ) - ] - ) + elif cdtype_id == plc.TypeID.STRUCT: + self._dtype = StructDtype.from_arrow( + plc.interop.to_arrow(self.c_value).type + ) + elif cdtype_id == plc.TypeID.LIST: + self._dtype = ListDtype.from_arrow(plc.interop.to_arrow(self.c_value).type) else: - self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (cdtype_id) - ] + self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[cdtype_id] diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd deleted file mode 100644 index 18b1d26e4db..00000000000 --- a/python/cudf/cudf/_lib/types.pxd +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -from pylibcudf.libcudf.column.column_view cimport column_view - -ctypedef int32_t underlying_type_t_type_id - -cdef dtype_from_column_view(column_view cv) - -cpdef dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx deleted file mode 100644 index 777bd070b32..00000000000 --- a/python/cudf/cudf/_lib/types.pyx +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd - -from libcpp.memory cimport make_shared, shared_ptr - -cimport pylibcudf.libcudf.types as libcudf_types -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view - -import pylibcudf as plc - -import cudf - - -SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { - np.dtype("int8"): plc.types.TypeId.INT8, - np.dtype("int16"): plc.types.TypeId.INT16, - np.dtype("int32"): plc.types.TypeId.INT32, - np.dtype("int64"): plc.types.TypeId.INT64, - np.dtype("uint8"): plc.types.TypeId.UINT8, - np.dtype("uint16"): plc.types.TypeId.UINT16, - np.dtype("uint32"): plc.types.TypeId.UINT32, - np.dtype("uint64"): plc.types.TypeId.UINT64, - np.dtype("float32"): plc.types.TypeId.FLOAT32, - np.dtype("float64"): plc.types.TypeId.FLOAT64, - np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): plc.types.TypeId.STRING, - np.dtype("bool"): plc.types.TypeId.BOOL8, - np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, -} -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - plc_type: np_type - for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() -} -# There's no equivalent to EMPTY in cudf. We translate EMPTY -# columns from libcudf to ``int8`` columns of all nulls in Python. -# ``int8`` is chosen because it uses the least amount of memory. 
-PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") - - -size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] - - -cdef dtype_from_lists_column_view(column_view cv): - # lists_column_view have no default constructor, so we heap - # allocate it to get around Cython's limitation of requiring - # default constructors for stack allocated objects - cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) - cdef column_view child = lv.get()[0].child() - - if child.type().id() == libcudf_types.type_id.LIST: - return cudf.ListDtype(dtype_from_lists_column_view(child)) - elif child.type().id() == libcudf_types.type_id.EMPTY: - return cudf.ListDtype("int8") - else: - return cudf.ListDtype( - dtype_from_column_view(child) - ) - -cdef dtype_from_structs_column_view(column_view cv): - fields = { - str(i): dtype_from_column_view(cv.child(i)) - for i in range(cv.num_children()) - } - return cudf.StructDtype(fields) - -cdef dtype_from_column_view(column_view cv): - cdef libcudf_types.type_id tid = cv.type().id() - if tid == libcudf_types.type_id.LIST: - return dtype_from_lists_column_view(cv) - elif tid == libcudf_types.type_id.STRUCT: - return dtype_from_structs_column_view(cv) - elif tid == libcudf_types.type_id.DECIMAL64: - return cudf.Decimal64Dtype( - precision=cudf.Decimal64Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL32: - return cudf.Decimal32Dtype( - precision=cudf.Decimal32Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL128: - return cudf.Decimal128Dtype( - precision=cudf.Decimal128Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - else: - return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (tid) - ] - - -cpdef dtype_to_pylibcudf_type(dtype): - if isinstance(dtype, cudf.ListDtype): - return plc.DataType(plc.TypeId.LIST) - elif isinstance(dtype, cudf.StructDtype): - return plc.DataType(plc.TypeId.STRUCT) - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = plc.TypeId.DECIMAL128 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = plc.TypeId.DECIMAL64 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = plc.TypeId.DECIMAL32 - return plc.DataType(tid, -dtype.scale) - # libcudf types don't support timezones so convert to the base type - elif isinstance(dtype, pd.DatetimeTZDtype): - dtype = np.dtype(f" ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes return self.codes - gather_map = self.codes.astype(libcudf.types.size_type_dtype).fillna(0) + gather_map = self.codes.astype(SIZE_TYPE_DTYPE).fillna(0) out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1192,10 +1192,10 @@ def _concat( codes = [o.codes for o in objs] newsize = sum(map(len, codes)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e23ca810065..30da8727366 100644 --- 
a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,7 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -60,9 +59,11 @@ from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, + dtype_to_pylibcudf_type, find_common_type, get_time_unit, is_column_like, @@ -874,7 +875,7 @@ def indices_of( value = as_column(value, dtype=self.dtype, length=1) mask = value.contains(self) return apply_boolean_mask( # type: ignore[return-value] - [as_column(range(0, len(self)), dtype=size_type_dtype)], mask + [as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask )[0] def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: @@ -954,7 +955,7 @@ def take( # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. if indices.dtype.kind not in {"u", "i"}: - indices = indices.astype(libcudf.types.size_type_dtype) + indices = indices.astype(SIZE_TYPE_DTYPE) GatherMap(indices, len(self), nullify=not check_bounds or nullify) gathered = copying.gather([self], indices, nullify=nullify) # type: ignore[arg-type] return gathered[0]._with_type_metadata(self.dtype) # type: ignore[return-value] @@ -1743,9 +1744,7 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), + as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1754,21 +1753,16 @@ def column_empty( cudf.core.column.NumericalColumn( data=as_buffer( rmm.DeviceBuffer( - size=row_count - * cudf.dtype(libcudf.types.size_type_dtype).itemsize + size=row_count * cudf.dtype(SIZE_TYPE_DTYPE).itemsize ) ), size=None, - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ) elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = as_buffer(rmm.DeviceBuffer(size=0)) - children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), - ) + children = (as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE),) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -2552,10 +2546,9 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: ) newsize = sum(map(len, objs)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"Result of concat cannot have " f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: return column_empty(0, head.dtype) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6fc2b5d4ca2..04b4003c510 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import 
ColumnBase, as_column @@ -22,6 +21,7 @@ from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype from cudf.core.missing import NA +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from collections.abc import Sequence @@ -258,7 +258,7 @@ def from_sequences( offset_col = cast( NumericalColumn, - column.as_column(offset_vals, dtype=size_type_dtype), + column.as_column(offset_vals, dtype=SIZE_TYPE_DTYPE), ) # Build ListColumn diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 20eded9a27f..2bee85cb387 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,16 +19,18 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + can_convert_to_column, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -5611,7 +5613,7 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: offsets = column.as_column( - 0, length=size + 1, dtype=size_type_dtype + 0, length=size + 1, dtype=SIZE_TYPE_DTYPE ) children = (offsets,) @@ -5888,7 +5890,7 @@ def as_decimal_column( ) -> cudf.core.column.DecimalBaseColumn: plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( self.to_pylibcudf(mode="read"), - libcudf.types.dtype_to_pylibcudf_type(dtype), + dtype_to_pylibcudf_type(dtype), ) result = Column.from_pylibcudf(plc_column) result.dtype.precision = dtype.precision # type: ignore[union-attr] diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 4b6ad59c8e1..aaaf6c7ee4f 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,11 +1,11 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast from typing_extensions import Self import cudf -from cudf._lib.types import size_type_dtype +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from cudf.core.column import NumericalColumn @@ -63,7 +63,7 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): # Alternately we can have an Optional[Column] and handle None # specially in _gather. self.column = cast( - "NumericalColumn", self.column.astype(size_type_dtype) + "NumericalColumn", self.column.astype(SIZE_TYPE_DTYPE) ) else: if self.column.dtype.kind not in {"i", "u"}: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 8ed233ba737..ce7fb968069 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations import decimal @@ -57,7 +57,8 @@ def dtype(arbitrary): if np_dtype.kind in set("OU"): return np.dtype("object") elif ( - np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES + np_dtype + not in cudf.utils.dtypes.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES ): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 17302311a7e..7bc4b08fc49 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -21,7 +21,6 @@ import cudf import cudf.core._internals from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( is_list_like, @@ -46,7 +45,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.dtypes import cudf_dtype_to_pa_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, cudf_dtype_to_pa_type from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -588,7 +587,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: offsets, group_keys, (indices,) = self._groups( [ cudf.core.column.as_column( - range(len(self.obj)), dtype=size_type_dtype + range(len(self.obj)), dtype=SIZE_TYPE_DTYPE ) ] ) @@ -1185,7 +1184,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # aggregation scheme in libcudf. This is probably "fast # enough" for most reasonable input sizes. _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) @@ -1199,7 +1198,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): group_offsets = group_offsets[:-1] else: group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype) + to_take = np.arange(size_per_group.sum(), dtype=SIZE_TYPE_DTYPE) fixup = np.empty_like(size_per_group) fixup[0] = 0 np.cumsum(size_per_group[:-1], out=fixup[1:]) @@ -1500,11 +1499,11 @@ def sample( # into a numpy array directly, rather than a list. # TODO: this uses the sort-based groupby, could one use hash-based? _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( - size_type_dtype.type(n), size_per_group.shape + SIZE_TYPE_DTYPE.type(n), size_per_group.shape ) if not replace and (minsize := size_per_group.min()) < n: raise ValueError( @@ -1517,7 +1516,7 @@ def sample( # which is round-to-nearest, ties to sgn(x) * inf). 
samples_per_group = np.round( size_per_group * frac, decimals=0 - ).astype(size_type_dtype) + ).astype(SIZE_TYPE_DTYPE) if replace: # We would prefer to use cupy here, but their rng.integers # interface doesn't take array-based low and high @@ -1525,7 +1524,7 @@ def sample( low = 0 high = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) - indices = rng.integers(low, high, dtype=size_type_dtype) + indices = rng.integers(low, high, dtype=SIZE_TYPE_DTYPE) indices += np.repeat(group_offsets[:-1], samples_per_group) else: # Approach: do a segmented argsort of the index array and take @@ -1533,7 +1532,7 @@ def sample( # We will shuffle the group indices and then pick them out # from the grouped dataframe index. nrows = len(group_values) - indices = cp.arange(nrows, dtype=size_type_dtype) + indices = cp.arange(nrows, dtype=SIZE_TYPE_DTYPE) if len(size_per_group) < 500: # Empirically shuffling with cupy is faster at this scale rs = cp.random.get_random_state() @@ -1557,7 +1556,7 @@ def sample( indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? - want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) + want = np.arange(samples_per_group.sum(), dtype=SIZE_TYPE_DTYPE) scan = np.empty_like(samples_per_group) scan[0] = 0 np.cumsum(samples_per_group[:-1], out=scan[1:]) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b535e8aabd2..0d1bf552982 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -53,6 +52,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, @@ -1002,7 +1002,7 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [self._range.index(value)] except ValueError: i = [] - return as_column(i, dtype=size_type_dtype) + return as_column(i, dtype=SIZE_TYPE_DTYPE) def isin(self, values, level=None): if level is not None and level > 0: @@ -1348,7 +1348,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = as_column( -1, length=len(needle), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index eded681baf0..4c6f8a9c152 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -60,6 +60,7 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring +from cudf.utils.dtypes import SIZE_TYPE_DTYPE from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf @@ -3034,7 +3035,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: NumericalColumn, as_column( range(start, stop, stride), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ), len(self), diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6e965ceca66..ce7edc8fdbe 100644 --- a/python/cudf/cudf/core/join/join.py +++ 
b/python/cudf/cudf/core/join/join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import Any @@ -7,7 +7,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap @@ -17,6 +16,7 @@ _IndexIndexer, _match_join_keys, ) +from cudf.utils.dtypes import SIZE_TYPE_DTYPE class Merge: @@ -243,7 +243,7 @@ def _gather_maps(self, left_cols, right_cols): # tables, we gather from iota on both right and left, and then # sort the gather maps with those two columns as key. key_order = [ - cudf.core.column.as_column(range(n), dtype=size_type_dtype).take( + cudf.core.column.as_column(range(n), dtype=SIZE_TYPE_DTYPE).take( map_, nullify=null, check_bounds=False ) for map_, n, null in zip(maps, lengths, nullify) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e7efd01ca85..64ec099cb39 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -17,7 +17,6 @@ import cudf import cudf._lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column @@ -34,7 +33,7 @@ ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import is_column_like +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_column_like from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -199,7 +198,7 @@ def __init__( ) if lo == -1: # Now we can gather and insert null automatically - code[code == -1] = np.iinfo(size_type_dtype).min + code[code == -1] = np.iinfo(SIZE_TYPE_DTYPE).min result_col = level._column.take(code, nullify=True) source_data[i] = result_col._with_type_metadata(level.dtype) @@ -1578,11 +1577,11 @@ def droplevel(self, level=-1) -> Self | cudf.Index: def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.MultiIndex: - # cudf uses np.iinfo(size_type_dtype).min as missing code + # cudf uses np.iinfo(SIZE_TYPE_DTYPE).min as missing code # pandas uses -1 as missing code pd_codes = ( code.find_and_replace( - column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(np.iinfo(SIZE_TYPE_DTYPE).min, length=1), column.as_column(-1, length=1), ) for code in self._codes @@ -1903,7 +1902,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = column.as_column( -1, length=len(target), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): return _return_get_indexer_result(result.values) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 0abd42d4d4e..eedd777aafe 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations import itertools @@ -12,13 +12,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor -from cudf.utils.dtypes import min_unsigned_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, min_unsigned_type if TYPE_CHECKING: from cudf._typing import Dtype @@ -1333,10 +1332,10 @@ def _one_hot_encode_column( else: column = column._get_decategorized_column() # type: ignore[attr-defined] - if column.size * categories.size >= np.iinfo(size_type_dtype).max: + if column.size * categories.size >= np.iinfo(SIZE_TYPE_DTYPE).max: raise ValueError( "Size limitation exceeded: column.size * category.size < " - f"np.iinfo({size_type_dtype}).max. Consider reducing " + f"np.iinfo({SIZE_TYPE_DTYPE}).max. Consider reducing " "size of category" ) result_labels = ( diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 6d617cbf38e..7e8468c8e8a 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations import errno @@ -16,11 +16,13 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_hashable, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) from cudf.utils.performance_tracking import _performance_tracking _CSV_HEX_TYPE_MAP = { diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ff326e09315..16c7d189dfd 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import os @@ -14,10 +14,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from cudf.core.column import ColumnBase diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index f3124552fd1..0ac2950a22b 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
from __future__ import annotations import itertools @@ -11,11 +11,11 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock from cudf.core.index import _index_from_data from cudf.utils import ioutils +from cudf.utils.dtypes import dtype_to_pylibcudf_type try: import ujson as json # type: ignore[import-untyped] diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 31a8f4de3b3..9e932acb5fa 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime @@ -11,6 +11,8 @@ import pyarrow as pa from pandas.core.dtypes.common import infer_dtype_from_object +import pylibcudf as plc + import cudf if TYPE_CHECKING: @@ -151,7 +153,7 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif cudf.api.types.is_decimal128_dtype(dtype): return cudf.core.dtypes.Decimal128Dtype - elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: + elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: return dtype.type return infer_dtype_from_object(dtype) @@ -604,6 +606,66 @@ def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: return dtype.base +def dtype_to_pylibcudf_type(dtype) -> plc.DataType: + if isinstance(dtype, cudf.ListDtype): + return plc.DataType(plc.TypeId.LIST) + elif isinstance(dtype, cudf.StructDtype): + return plc.DataType(plc.TypeId.STRUCT) + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = plc.TypeId.DECIMAL128 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = plc.TypeId.DECIMAL64 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = plc.TypeId.DECIMAL32 + return plc.DataType(tid, -dtype.scale) + # libcudf types don't support timezones so convert to the base type + elif isinstance(dtype, pd.DatetimeTZDtype): + dtype = _get_base_dtype(dtype) + else: + dtype = np.dtype(dtype) + return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) + + +SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { + np.dtype("int8"): plc.types.TypeId.INT8, + np.dtype("int16"): plc.types.TypeId.INT16, + np.dtype("int32"): plc.types.TypeId.INT32, + np.dtype("int64"): plc.types.TypeId.INT64, + np.dtype("uint8"): plc.types.TypeId.UINT8, + np.dtype("uint16"): plc.types.TypeId.UINT16, + np.dtype("uint32"): plc.types.TypeId.UINT32, + np.dtype("uint64"): plc.types.TypeId.UINT64, + np.dtype("float32"): plc.types.TypeId.FLOAT32, + np.dtype("float64"): plc.types.TypeId.FLOAT64, + np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, + np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, + np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, + np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, + np.dtype("object"): plc.types.TypeId.STRING, + np.dtype("bool"): plc.types.TypeId.BOOL8, + np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, + np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, +} +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { + plc_type: np_type + for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() +} +# There's no 
equivalent to EMPTY in cudf. We translate EMPTY +# columns from libcudf to ``int8`` columns of all nulls in Python. +# ``int8`` is chosen because it uses the least amount of memory. +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype( + "object" +) +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") + + +SIZE_TYPE_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] + # Type dispatch loops similar to what are found in `np.add.types` # In NumPy, whether or not an op can be performed between two # operands is determined by checking to see if NumPy has a c/c++ From 559cda24e4258da1aa35b7de60f46e8a86b1effa Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:18:27 -0800 Subject: [PATCH 03/12] Use 64-bit offsets only if the current strings column output chunk size exceeds threshold (#17693) This PR improves on #17207 and only uses 64-bit offsets if the current output chunk of a strings column exceeds the large-strings threshold instead of using cumulative strings column sizes per `pass` or `row group` level. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17693 --- cpp/src/io/parquet/reader_impl.cpp | 48 +++++++-------------- cpp/src/io/parquet/reader_impl_chunking.hpp | 5 +-- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c48ff896e33..f9fcca6bb4f 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num _stream); } - // Compute column string sizes (using page string offsets) for this subpass + // Compute column string sizes (using page string offsets) for this output table chunk col_string_sizes = calculate_page_string_offsets(); - // ensure cumulative column string sizes have been initialized - if (pass.cumulative_col_string_sizes.empty()) { - pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); - } - - // Add to the cumulative column string sizes of this pass - std::transform(pass.cumulative_col_string_sizes.begin(), - pass.cumulative_col_string_sizes.end(), - col_string_sizes.begin(), - pass.cumulative_col_string_sizes.begin(), - std::plus<>{}); - // Check for overflow in cumulative column string sizes of this pass so that the page string // offsets of overflowing (large) string columns are treated as 64-bit. 
auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), - pass.cumulative_col_string_sizes.cend(), + auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), + col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; }); if (has_large_strings and not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } - // Mark any chunks for which the cumulative column string size has exceeded the - // large strings threshold - if (has_large_strings) { - for (auto& chunk : pass.chunks) { - auto const idx = chunk.src_col_index; - if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } - } + // Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output + // column chunks and the large strings threshold. + for (auto& chunk : pass.chunks) { + auto const idx = chunk.src_col_index; + chunk.is_large_string_col = (col_string_sizes[idx] > threshold); } } @@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and col_string_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data( - col_string_sizes[pass.chunks[c].src_col_index], - pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > - static_cast(strings::detail::get_offset64_threshold()), - _stream); + out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], + pass.chunks[c].is_large_string_col, + _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -416,11 +400,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { - // need to cap off the string offsets column - auto const sz = static_cast(col_string_sizes[idx]); - if (sz <= strings::detail::get_offset64_threshold()) { + // only if it is not a large strings column + if (col_string_sizes[idx] <= + static_cast(strings::detail::get_offset64_threshold())) { out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); - final_offsets.emplace_back(sz); + final_offsets.emplace_back(static_cast(col_string_sizes[idx])); } } } diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index ca46f198bb8..4a773fbced1 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -130,9 +130,6 @@ struct pass_intermediate_data { rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; - // cumulative strings column sizes. - std::vector cumulative_col_string_sizes{}; - int level_type_size{0}; // skip_rows / num_rows for this pass. 
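
A minimal standalone sketch of the chunk-level decision this patch describes: a strings column chunk is switched to 64-bit offsets only when that chunk's own decoded size exceeds the large-strings threshold, rather than a pass-level cumulative size. The struct and function names below are illustrative placeholders only, not cudf's internal API.

#include <cstddef>
#include <vector>

// Illustrative stand-in for the reader's per-column-chunk descriptor; not cudf's type.
struct chunk_desc {
  std::size_t output_string_bytes;   // decoded string size of this output table chunk
  bool is_large_string_col = false;  // whether 64-bit offsets are needed for this chunk
};

// Mark (or unmark) each chunk based only on its own output size versus the
// large-strings threshold, instead of accumulating sizes across the whole pass.
inline void mark_large_string_chunks(std::vector<chunk_desc>& chunks, std::size_t threshold)
{
  for (auto& c : chunks) { c.is_large_string_col = (c.output_string_bytes > threshold); }
}

Keyed per chunk like this, a column that is large in aggregate but small in every individual output chunk keeps 32-bit offsets, which is the improvement over the earlier per-pass cumulative check.
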
From fb2413e1505297e737095d97e0732eec52519802 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 10 Jan 2025 10:06:35 -0800 Subject: [PATCH 04/12] Make tests build without relaxed constexpr (#17691) Contributes to https://github.com/rapidsai/cudf/issues/7795 This PR updates tests to build without depending on the relaxed constexpr build option. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17691 --- .../cudf/detail/utilities/integer_utils.hpp | 12 +++-- cpp/include/cudf/utilities/span.hpp | 40 +++++++++------ cpp/src/io/utilities/parsing_utils.cuh | 49 ++++++++++--------- cpp/src/io/utilities/trie.cuh | 8 ++- .../transform/segmented_row_bit_count_test.cu | 4 +- cpp/tests/utilities/column_utilities.cu | 18 ++++--- 6 files changed, 75 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 2e3d71815c0..44a86f1c84f 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ */ #include +#include +#include #include #include @@ -44,13 +46,17 @@ namespace util { * `modulus` is positive. The safety is in regard to rollover. */ template -constexpr S round_up_safe(S number_to_round, S modulus) +CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } auto rounded_up = number_to_round - remainder + modulus; if (rounded_up < number_to_round) { - throw std::invalid_argument("Attempt to round up beyond the type's maximum value"); +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Attempt to round up beyond the type's maximum value", cudf::data_type_error); +#else + CUDF_UNREACHABLE("Attempt to round up beyond the type's maximum value"); +#endif } return rounded_up; } diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index e7b76946248..b5044a58934 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,11 +197,16 @@ struct host_span : public cudf::detail::span_basedata() + offset, count, _is_device_accessible}; } @@ -434,8 +439,8 @@ struct device_span : public cudf::detail::span_basedata() + offset, count}; } @@ -475,28 +480,28 @@ class base_2dspan { * * @return A pointer to the first element of the span */ - [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto data() const noexcept { return _flat.data(); } /** * @brief Returns the size in the span as pair. 
* * @return pair representing rows and columns size of the span */ - [[nodiscard]] constexpr auto size() const noexcept { return _size; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return _size; } /** * @brief Returns the number of elements in the span. * * @return Number of elements in the span */ - [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto count() const noexcept { return _flat.size(); } /** * @brief Checks if the span is empty. * * @return True if the span is empty, false otherwise */ - [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_empty() const noexcept { return count() == 0; } /** * @brief Returns a reference to the row-th element of the sequence. @@ -507,7 +512,7 @@ class base_2dspan { * @param row the index of the element to access * @return A reference to the row-th element of the sequence, i.e., `data()[row]` */ - constexpr RowType operator[](size_t row) const + CUDF_HOST_DEVICE constexpr RowType operator[](size_t row) const { return _flat.subspan(row * _size.second, _size.second); } @@ -517,7 +522,10 @@ class base_2dspan { * * @return A flattened span of the 2D span */ - [[nodiscard]] constexpr RowType flat_view() const { return _flat; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr RowType flat_view() const + { + return _flat; + } /** * @brief Construct a 2D span from another 2D span of convertible type diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 75e45a68842..9833dab282e 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -171,7 +171,10 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag) } // Converts character to lowercase. -constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; } +CUDF_HOST_DEVICE constexpr char to_lower(char const c) +{ + return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; +} /** * @brief Checks if string is infinity, case insensitive with/without sign @@ -515,13 +518,13 @@ struct ConvertFunctor { template and !std::is_same_v and !cudf::is_fixed_point())> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex = false) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex = false) { auto const value = [as_hex, &opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values @@ -564,13 +567,13 @@ struct ConvertFunctor { * @brief Dispatch for boolean type types. 
*/ template )> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex) { auto const value = [&opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values @@ -593,13 +596,13 @@ struct ConvertFunctor { * is not valid. In such case, the validity mask is set to zero too. */ template )> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex) { auto const value = [&opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh index c0efc5b6f20..dbdc4a34277 100644 --- a/cpp/src/io/utilities/trie.cuh +++ b/cpp/src/io/utilities/trie.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,16 +74,14 @@ CUDF_EXPORT trie create_serialized_trie(std::vector const& keys, /* * @brief Searches for a string in a serialized trie. * - * Can be executed on host or device, as long as the data is available - * * @param trie Pointer to the array of nodes that make up the trie * @param key Pointer to the start of the string to find * @param key_len Length of the string to find * * @return Boolean value; true if string is found, false otherwise */ -CUDF_HOST_DEVICE inline bool serialized_trie_contains(device_span trie, - device_span key) +__device__ inline bool serialized_trie_contains(device_span trie, + device_span key) { if (trie.empty()) { return false; } if (key.empty()) { return trie.front().is_leaf; } diff --git a/cpp/tests/transform/segmented_row_bit_count_test.cu b/cpp/tests/transform/segmented_row_bit_count_test.cu index 652b9053582..0e4f623f0a2 100644 --- a/cpp/tests/transform/segmented_row_bit_count_test.cu +++ b/cpp/tests/transform/segmented_row_bit_count_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,7 +74,7 @@ compute_segmented_row_bit_count(cudf::table_view const& input, cudf::size_type s // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. 
auto const size_begin = d_sizes + segment_idx * segment_length; - auto const size_end = std::min(size_begin + segment_length, d_sizes + num_rows); + auto const size_end = cuda::std::min(size_begin + segment_length, d_sizes + num_rows); return thrust::reduce(thrust::seq, size_begin, size_end); })); diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index fb9bdeb0b22..6888f26fd16 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,6 +37,8 @@ #include #include +#include +#include #include #include #include @@ -412,14 +414,16 @@ class corresponding_rows_not_equivalent { T const y = rhs.element(rhs_index); // Must handle inf and nan separately - if (std::isinf(x) || std::isinf(y)) { + if (cuda::std::isinf(x) || cuda::std::isinf(y)) { return x != y; // comparison of (inf==inf) returns true - } else if (std::isnan(x) || std::isnan(y)) { - return std::isnan(x) != std::isnan(y); // comparison of (nan==nan) returns false + } else if (cuda::std::isnan(x) || cuda::std::isnan(y)) { + return cuda::std::isnan(x) != + cuda::std::isnan(y); // comparison of (nan==nan) returns false } else { - T const abs_x_minus_y = std::abs(x - y); - return abs_x_minus_y >= std::numeric_limits::min() && - abs_x_minus_y > std::numeric_limits::epsilon() * std::abs(x + y) * fp_ulps; + T const abs_x_minus_y = cuda::std::abs(x - y); + return abs_x_minus_y >= cuda::std::numeric_limits::min() && + abs_x_minus_y > + cuda::std::numeric_limits::epsilon() * cuda::std::abs(x + y) * fp_ulps; } } else { // if either is null, then the inequality was checked already From dc2a75cba40d38f4a6ba66e652764e96fa6b593d Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Jan 2025 13:22:39 -0500 Subject: [PATCH 05/12] Add special orc test data: timestamp interspersed with null values (#17713) Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17713 --- ...e.timestamp.desynced.snappy.RLEv2.hasNull.orc | Bin 0 -> 5951 bytes ...stamp.desynced.uncompressed.RLEv2.hasNull.orc | Bin 0 -> 6565 bytes python/cudf/cudf/tests/test_orc.py | 6 ++++++ 3 files changed, 6 insertions(+) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc new file mode 100644 index 0000000000000000000000000000000000000000..8772f84c3ba2a7942323f49ac28c3e5b172a91fc GIT binary patch literal 5951 zcmaKwdvH_dnZ_T?xR5lJv24X%8VbY$ylTk?@?*+;e|9od$AXd0?qoGRN57280S zG&aHsjtl{0T3Dr7sAwCiY&WdvCaltKsEEN4O+!UD5U~v)ifu$LL?qbAfQ=0HI_b{r z{ZBGzYY!Sz_4)7SG|3=|4MJiM<0FN zSjS?0h5h~`a{X7BdmWT4P*38k=9?dUW4rG+tVN&lqp#cRa1V?5YL@uTum4!Qy>{QtYjU2fOX;~l)?6%~6{T_DgDlxv67x}lc3 zAy@s-6WnDR^h1M&q1}x``^lk0+?Pj9L#LaDCYpz)%)6AIeq7qRrJUNLwrpu>+v4K= z^ofow8?0LfZCiHRx9sQNd&se6v~$bpt}PSYTc-H0z_qosXKOjLRlRy^i{L)jLtCHt 
z_SOv#Zyj8-b+_P^{qC)Y9@#qj=+@JZZJj8*Z>sktJn>TLcV8;^yrlj+p6z?QHGj`r z`xLK#Eq?WV-d->7z&hUHf8d??0si%A-j(&d*$sTb5BX(2{QEO}!?S$bkN9hzJ?*lAoP>fq!K)e|B)`(=QfX3kYOG0>c)8ZL46-OZae`pntny z`wqe0py0qT&hHeQ__5&JPXt$X31)ZW(mxiKg$iXa7aD$AXnUpbiGSL$Vb6}iy*qaQ zY{&kR6^Hii82$N<)4$j;@yi`kC1LCjmi{VO9to;n3$~PsUB3=K@h`y*uLlPY1b3H) z_eX<=4hBbm6FmKI!HIjsQ*RFA&~WL$50}S=)%Wac`R%alcf(Ko$MA-?hX+L~cmIBP ze|-4Rk>SyIhEI$3O&lGbI<^y|J4@f)S-$K6_3@o8f7t1IZ|4(#+_~XI(e^(T?M)UP zI9YV~&#cz}EIRi=(Unt0v!{y%AF}MJV%eEu!+#aq{!+Z=BUaDZ;{J2R+s_y8O&1@y z!1~T#i%(oEJ~vi;WxRNHg7w`?OUp7#WtW#4K3;13g!TPTm#+P6Y5$d_+dp5rH_Q6r z)uo5OSbE~irRT0Ky)wz#I8`FJUQ(7TkxiEvM3-K?S+XWGxpttO{;m>DQ$45T8cI-LhTNmH>atg+gv1<$c#^Ynk>oYIE z^A4-T>^Ghnd*kNJTO%XaCO2c~^0@N$%$cJjw; zTaNOk%mLG~i8r~kx_952+&x;GxV>j^OS0##Gn0?Hu^tWwW^d?28ip_4j`=@T1Ge9_*iaf6P4L+C8xK zW3y!A@)Kj$oHf*cf90|2jru>0Idj{7I`HdHnt6o}C&wPnz5h=Gzx(8va%10%k=|+k z-huZ%HETES`SaLQ6C1(fN+y zeEh-NGiQ(9y*)X$ba-K6@!Pe0biQz^cW7YM(DB0AhZFNFrpC7om@}*HnV9{j-KCr*k`>cefF5}tn;GvM@@fu#`xG%KN>mP*E@>mn$GtVn$`O~;+;NtYmhvUxYAG&8WDE9H8bQq`YQdE zfy!XP!OC!Dq%v9=tBhABDwCC|%5=e7mD$Q%Wxk4CC9D!viK`@4M8T0NvWlvrtC%Wx zm8Z&E<*V{n1*<|;;i^bgv?^8=uS!%U3r<$0t1?yDs$5mRnq4id7FCN2&QufC+G?_z zs-~-%YIn7#+FNj;+Fu>04pxV%!_|@MXmzYQUU0cOS)Hm*S7)lT)w$}tlr0q&T$PHY z5-B0oN=YdtrKOD2T`(>6N_|qlG$0L1L(;G`B8?WzN#oLlG$~C<)6$GIE6qvsoC29p zCX$I|5*Z=W%19X{qd7}tZkb2smHA|TSwI$)g=Aq_R2GxPWeHhQmXf7q8CjOIQkIvq zO{5M&VX?6kg6F3cn(t2r5E~up**}Dq@PbBB@9z(u#~C ztH>$xHS8K8XI+iBMp8r6XluwCs)nv%YTTS>YP>bR8h=foCRh`y3D-nwqMS`N@tQ#q&e25UpLVb067 z(b`yTyf#srtWDLXYcsW3&c52blC2afMM|+!q9l}BCCPbBNh=wpTj^1Hl|H3k8Bhi} z2bE!EL>X1alyPN3nN+5fX=PTKQ|47{l~5&8iB%F6!8xKLRg{WWF)Fvpqw=bJDnIAA zDyRyn!m5ZWs*0)Ns)Q=ZIjKskGODa9r^>6@YN1-B7IV(13AI*Dswp+CX4G!ANA2ZY zQ2W&Zbx<8rht&~vR2@^tIhWN*bxNI9XVh7BPMz1VHA2o+jaVbm5E`w9)KD5)!)V-` zX^mIo)A%(3O;8ikgf$UOlryJ^YZ98ICZ$PhGMcO=r^#!DT9H<)m1qgAR!eFrEzMn` zb!$CZuhysaYXjP#Hlz)6muaKgm^Q9WXp`EMHm%KQv)q;1ypF9C>O?xRPNE}pS{=!) z(9t?Z=hk_2UY$?p*9CMzu0j{qMIL`r7uCgdaa}@})TMN3u1=TL<#hQvcAcd9iMk}$QJ1dE)Me{(b@_UB zy|7+XFRmx*we@5@RZrJ5_3nC4y_fq)y}v$CAFL16hwCHt(fU|@ocm;bvOZOxuFuqG z>vQ$_26lsxyRJdpAZZ{Pv<+kf)j&5e4Q}o;4c-P{gTEor5NrrFgc~9aQSPROctfHg zxqbl+DKwKh4LLOA!5xBL2)zh;G4vAX3Fx)Zlh9Mp)8Gz6?}pw3y%%~P^nU0A z&0;O>Jy4+9$pAq*lI#4t!;AYjnKK*B)7 zz`)>!!2^RA1|JN57y>W^!955=7={Q8Q5a${#9>Ilkc1%xLmJ$-V93IdgCUPbHX4O! z6roX!MhO}TaF3voL?eYp8jTDZ-Dvcn(TheO8vWoNM`I9;AvA{37(rtcjWIOF(U?GE z65Nw$OrtS_#w;3hXv~9TgA{@kffR##1|$Je3z7s$fuun)Al)E6AidyT0O$UWHK%qXb3*MlFmaj1-JCj0}u! 
z7`-t1VD!TnfH4ST2*xms5g4Q3&cPUmF#%%|#uSWc7&9&!d^0a~jYrLbDjnlH8vcy+$;X)ACs~ z(`aUGX*mZGXt}G<|nf-d%=5T=8Hu>0CRAbUgX0tM_`W5 zzQ3sB;5~V}$^df;=Jf5rEX-M$b1>)8!bXb_ymk4<4x>ea79xM7A1x$WsJRsev@mFK zgZB(tylC;E#XmPaizb7;xm{cIMk zLbQs|Dn_dWtpr-N^NGV~rO-+*l=q|6jaJXX3kJ0M(CSBP0Ifl^hR_;DYZR?9w8qhz zKx-1MDYT~1nnCO0&F({M9uymt5R?d%7?cDQ0ZI#s1n)IaG$;m?87nBc_A5;KT z5WIt+!k{9cqM%}+;-C_slAuzc(%`)XDhnzHDh~@A79lJmSj4bMU?IRe0t*QX1q%%e z1B)9L4=i3-e6aYzI}S?_mJlpqSR$}QVTr*Kha~|^618G9NO||XQN#R-c_`V(Jnzdfp#t0NwiaFr_s)!-3{I}+P!G^q1})60NR6S z51~Db_6XXe;LV{uj`jrFlW0$&J&pDZ+OufSp*;_N0Xl@}5TQei4hcF4bZF5*qJu&Q z4gL~zxY6N3hZh|_bokK`Kt~WAA#{YnUxtn-I%4REqa%TiBsx;)NTVZzjx6{qdj~u6 zR<>1W64;tv;*Y8n6cW3TxOJu|};iYuuW!Cao!J+L~Sb z)6Co0Hla;q6Wb& zPubJ8Iw3a4mil%9XcJrU1y|j<^(*Zh2hv+aJp`-jwbevAm zNjgQR=?tBvb9CMzbch^chr~fRv<}iiIcWZ{!|m`mybhnk?+7@8j*uhFf7uar#2j%) z!jW{O9BD_!k>&4m5O*9I^&&*&SYn*GtGaiGuxT#%y+T7gk7R8ahIfv;2-HCyQnU@i|KNAdAht^zAiui zcvrA1)D`ZEbVa*jUGc6&SCW6SE8Ugp%68?t^4;uiVYjGT+)Z?AyUA{<)E@yCdDv?pSx6f4Mu^o$5|^XS%c9x$eA^?G*B_I>k;FIcHu_;1aq-F0o7EB3xP* z>7rb;V2R7^^0>S%pUdwGxPq>bD=b*%in?O1xGUjGx>ByRE91%vR=V;%>>go{s7Kr* z=^=WwJ!B8v!}Pd&JU!kXUyr{h&=V9Wdcr-Co@h_3C*G6jN%o|A(gIyiwkOwb literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc new file mode 100644 index 0000000000000000000000000000000000000000..f5a1edbb10ee55f131764e18b638c35c8e06b0cf GIT binary patch literal 6565 zcmcI}eRLD|neHQ5Uls;CA_*YU;L0D8Mz%giwk3~b%O2U5JhCl$XcDS4b*gMPRM`fq zq_GiBaAXJ|vxQZfg^KQmD%%^b=uKFq-B1yOBbtVaZXlu!Ac}27K8Q%LkpUYS>~-7S zy}fsvJ@?#m`uk(v=ltI1eV_N8-^`pd=XJ7Hu~5Pn4HUWX2k@b20m#Kd{BcR{6HP8+ z?pb~+cx!*yP#Qb%(`eYB_*AGpce>QY5v#=FMjqz5Ur8<5Q6Tw}=$weTcX{CO?@6WC z-x)rzzT(}nYYd0Q|)KY48aZ$JO&?E>xtU$DOU1^W-Y^aYLw zzhM7?uYQr^_Y$sS^MfCKQ5`RRLFE6R)SpfE9}=!(dvE>j&(!f3kotWc3vK`Wga1qJ zFC_J+ZT}aEg*sk;?@#LZZ_nyaGyNxi=|9o`3I2=Jf4Kjwge&orAOG^->-c|s^1iZr zzAz^SL>w&VLQw!C4;(($2(VU&J`uJRqEL+ei*Tq2#~0)565LgcnG%lZF3xfZN3j$u z?&jF<;jForvvwJ$Z#llag0n};*}sx==swQzFF|=f=khAfOfDn(GIzNQx-zb@oNKS( zuBqg%m7`w4-KOO3A-MZh+(T7puI8Rqb1!SSGui@?uAr10)anPDY6e}kgOBo;t~U%0 z7zcON5ALG|5At6eF%O<<7#wdLoV4szfBa!d^X5`|v(~z~sb#ZE@cl>IHm|pB9md_KeQWWfZwdBz z1^b^69Qqr<@o(dQJ}J1oPB61xDEf|Yxex#Rl+gIJ(EeTFnrDP-d-3x=;kFIJJ$~W- zjlx6h_}kPcXP*;Z-XxqESn}laMOOnN#h}QzS!CZLTJr+l-zw_cCfc@Lv?nOqKZJ8T zM92SDboP6q%R5CgyKwRE3YUip6)zSVzh7v7sqoRi-@bnL_JKXycl}`dzTy=J_ii8g z;r3Jiuzma=w@(&_u`gKiqhM(ysC^~aR3df#IQZ!Q2(EuMIIusst0cTH8a#L)IP#O= zss9xmze_s#`VbBdmHhKiX>3S)*WRX|4!M3d^yt3~t$%Z9K(cbz&xiKKhYlVd8hLBz zlw|Mtk)g?>J20}NyH<0`&H4NWYPW;MTdUPY5v!u zv+osMK3Oz#YO&~jPJ3#x;`CzUZx-8syLin9obEG=`_3-jc5d;W^y2;JIbZ$V;^P+< zpB-I%d2I2_IOiJ|mn_dLQCwPL{BVi=BhI%zUb6O+C4HBdZ2NS{o-F4(SC$<5Y{~J@ zmz=%2 zzM|x{`2k>{jMt+zPE7QgBA z=xaBo-xwagIHX$hE@ zj=#>I(ZBr`zhu*9bJ;~r;mqp9k$WePayIKPzP@yG|!Ub0C0&O9$OTzmI>{olQG^!~o-cSkMbu3i0GKD5X-EIU4G z%h^JG_m&^6++g_Cs57_q`~5%us8LY(P;&I4+`E6@|Fe&dsyFmLAL*GA?&*K$V+*-q z_pe7EA73Bt|Lw;|>o&akucPaxw*Ij1ez}F(aOS1>^@*7Xj?YmLD zf7teRGn?Q0o4NfroYCQjvL_#${K^A!2XE~B$?)S>obLSj4{qOk&&k(@d#{}QdVcoI z+{n#h{ySSgb9!c1o}D{>(-|Ay{r2;JJ!?8Qck1TOpANtJxzjuOSMIs9H&6b{@SC5X zd~$Z<`P-L2ifp8M8R&))vA zPgd1mT>jqNhN-ca`{ysOYR)e^HDj;odF9zvpRQ`pKa`sL!S%6M?^v@IHvjT(8xP!A zcVKeQ6Bc)V>`dd~8xybJ8T`yFU%BdBGqiXR?msK$H&+G!|k~h z#kPxv({rzKpPL^kdhWAjM++9-vh_XMrSBKq;5@slr?2=TcV=YH)w6%uMEEgXL@@^@Jz$G-p4Pz`poe8-gOt=?KNX-?}bl# zhsSz1jSBl3F7|F6Dd`)&)Vp)kwxQvp-rb{*Z+MHd_2tp6{)SI`UmbnbKYXS4^^wwz z4WIYEIr_=Q;fda(W5WK1Ytz3P)AV1Mntp%G`RpS%CW}_Bd-lTg^u@8A&pmQ$`qQ!3 
zadCF~+StiWkKCEQH8!#7!u(`OY4O0U|GG*C}D&qy0DwCC|%5-I>GFzFe%*(lQalsY2R4$Vfa#Bvo zX*nZj(bU{)TNC*(NkvMLR%8@e-bzJY$yJJ#5~Wlr zQxZy2N%6{*jFMHll^&&6=~McZ0cDV8pJye4M2H9z5h6;&h&YiTk~{~ICNe~p$PsxJS0z?S zR8rn*6`>+kl!{g{DpuuIc~oBB!z#ZjpbDx&s<0}eimGC&xGJeisnV*9Dyzz=@>Sd_ zG4F{gX_c&ss3NPVD!PiPVyoP|r>eYFzAArJpek4ustQ*{s-nD&Rq?7sRkA8om9ENE zWvg;kdEQ{PxLQ&zt(H|2)nqkQO;M-xc>S%SWI$oWq zPFAO?)76>kEN^dhUd>gD)e^N-EmIR}Qcdw*Q8Q{*?N)o#UbRo{R|nKV-T`%39Z^Ts zF?C#>P$$(Xby}TO=hS%(S0mO)G*XRBL+}o3C=IP)G_1z0@o2mnpT^HSrU`07ny@CK ziE3h+xF(@V@=j>dnv5o^$!YRhu2!s-Xr;W^b1bga(Jo6>o8KAm3| z&;@lNU04^QcJ2E~Crpa=JVzCMBell#v8Uk`zgk41W>nCOxE=^pSot zKnBSW8Rjn~qhySXlL<0OrpPpzA+!9IWM0qJi}ez{R4>yLdQwmE%k+$%)w}f`y;twk z`}F~Rkgw8*^^vbXrjP1l`nW!!PwG?pG+(dJ>T~*h4Yx*IBdL+r$Z80_sfMbdYnU3g z#$Ds7@z(fi{CsOouqIR!u8Gt{YhpF=nnX>K@2E-FWNNZCxte?}w^m#$sg>3ewPY<- zOV={BY^}T2Q|skFTX^5U6jAEE?$?YORk$oT?%z+)MZeY zMO_YcdGH5e5W^sWK?;Km1_A~W1_}lm1_t~g7~C*;VDQ4=gTW6&0EQq8AsE8ozX(GV zh8PTS7!oifVMxJ{h9Lt(7W}<1KsF$Ih0RJ%R zDb&-bXHd_g-i>+>>b!Fp)4(FwrnEFtISXVe-P{gUJt50Hz>J zA(+B2MPQ18KMPYFrUXn$m{KsMVamXig((M99)bdx#V|`?mclH9nShyunSz;ynSo#t z%x;)HFneM4!R&`Q0CN!L5X@l+mckr`IRm_Y-92694n2!b*+Flb=W;6{T74c>`6eQ5BbAu#!{5dsw&!jp#<_)#>(&=5yM z0u4ztq^`ZV5UWQ+_S&@tejbh7oYRO#2^ytnl;wWC5NkpsHKm+EBZEeEYU?31deG=a zqYsUKGzPDG`p_6gW90hCLSPJy@f%-SASWSkpfQcc3>vd&%-xtd1Pd1y@y!SOV39(w z`sPoKu#m7&u+XqDu&}VWr@uD?ix+~2r$1ZZ2Ve=#Fbn)JED>0uGw&|oI0TQ~sxZQm zf+c+`Fat{#mK-d3G;z@+hTw_(BZtr=Llcod+=nI#P4w&vBbrz=xgmH8Og$JL9-OiGBgut zCg&1|&`hJ5nJ?`_vm4Ex`R9yi_MzF2<^Y<5Xbzz{jOHksV`z?}If3RRnp0>_qd9}- zg_GTj<~(RFXfbFBXenqJXaY0|nu6dJ&UVsv=_7wv>$W;bP$3Apu?aeprfE; zpyQwupp&3epwke%0Xhph2RaWc7gjN>5?H0M%3viRI1DQVD-A0HD+{X|Ru8ORSbebi zAvgwW5Y`Z^VOS%uMq!P?8izFjYZ8JJu%=|JqcwuoCbVyxosYs*XFbNZ2?v-^csd(a-ThwTx2)E=|P?FoBQ=&+~l8GF{Av*+8n?c#Px zyHvQkooFZ9sdl=ZX=mHr?Vfh8@ZolUd!Rko9%>J_N7|$9vG%y|vG!zpsy*GFY0tLj z+Vc#T5euJSq>PLq7?PnFnqe50aSNYfyo`_WGXW;ZgqScBVWPr~Oq@wDNhZalnGBO< za!lSKc1Rpjhs;4ZNC)Mh9gJ|u;dXc&UWd=&cLW?kN5~NtzUYWLVve{Y;Yd1Cj-)6_(})U!FISiJRRN+Ux&XV&=C|K=m>X2I-(u1j(A6+ zBiWJaNDJTS$ads9@}1mHai^qH+9~TKgoit+PP&unWINrRo=$J4uhTC))*0*!b%r}5 zozc!%XS_4fnG~MrOm}8Fvz@uld>6M%+$HIfb`f1<7u7{~FC=dIcX>3WSwr|l+)|< zIsMLnGw2LC!_J5^Dx7u3oe5{snR2F`8E4j+bLK?_F0o7ElDcFr!bQ3$7wuw1i(GD( z$K`eTTz*%;6?BDMVbM}o)D?5ZT?tpxm2#zB8CO=c(v|P#c8j|u-O_GZH_=UYQ{7BA z+wJc5bbGsf-Tv-CcTlA24tGboqusIYcz2>Z*`4Z6i}c;u?p$}C<+5T{!b({gONdM? z#nLRpvaFl+uwK^3`bAbY$cET38)2htjE%DiHYswjX*R=V*&Lf+&0Q^CEmvNHvBkoO;}25{Q9;SHPFl2J)-L~zw$!Bf@NU@(QIUkhkrsV>L(kLS`t~=U zdiopRUSA3Yw2+BJg?~ia5AUY_n4kJ@^EV9?**P9Qsn)9XqQ6<_Wnh3O Date: Mon, 13 Jan 2025 08:47:23 -0500 Subject: [PATCH 06/12] Add seed parameter to hash_character_ngrams (#17643) Adds a seed parameter to the `nvtext::hash_character_ngrams` API. Makes this more useful in conjunction with other nvtext APIs. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17643 --- cpp/include/nvtext/detail/generate_ngrams.hpp | 39 ------------------- cpp/include/nvtext/generate_ngrams.hpp | 4 +- cpp/src/text/generate_ngrams.cu | 13 ++++--- cpp/tests/streams/text/ngrams_test.cpp | 4 +- cpp/tests/text/ngrams_tests.cpp | 13 ++++++- 5 files changed, 25 insertions(+), 48 deletions(-) delete mode 100644 cpp/include/nvtext/detail/generate_ngrams.hpp diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp deleted file mode 100644 index ae48fed4e79..00000000000 --- a/cpp/include/nvtext/detail/generate_ngrams.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -namespace CUDF_EXPORT nvtext { -namespace detail { - -/** - * @copydoc hash_character_ngrams(cudf::strings_column_view const&, - * cudf::size_type, rmm::device_async_resource_ref) - * - * @param stream CUDA stream used for allocating/copying device memory and launching kernels - */ -std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, - cudf::size_type ngrams, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace detail -} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 54282b8ef3c..b2ba1798a8f 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -117,6 +117,7 @@ std::unique_ptr generate_character_ngrams( * * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. Default is 5. + * @param seed The seed value to use with the hash algorithm. Default is 0. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return A lists column of hash values @@ -124,6 +125,7 @@ std::unique_ptr generate_character_ngrams( std::unique_ptr hash_character_ngrams( cudf::strings_column_view const& input, cudf::size_type ngrams = 5, + uint32_t seed = 0, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 997b0278fe2..33d52ccd570 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include @@ -315,6 +315,7 @@ namespace { */ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + uint32_t seed, cudf::size_type const* d_ngram_offsets, cudf::hash_value_type* d_results) { @@ -332,7 +333,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st __shared__ cudf::hash_value_type hvs[block_size]; // temp store for hash values auto const ngram_offset = d_ngram_offsets[str_idx]; - auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{seed}; auto const end = d_str.data() + d_str.size_bytes(); auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1; @@ -368,6 +369,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& input, cudf::size_type ngrams, + uint32_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -400,7 +402,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co auto d_hashes = hashes->mutable_view().data(); character_ngram_hash_kernel<<>>( - *d_strings, ngrams, d_offsets, d_hashes); + *d_strings, ngrams, seed, d_offsets, d_hashes); return make_lists_column( input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr); @@ -419,11 +421,12 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + uint32_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::hash_character_ngrams(strings, ngrams, stream, mr); + return detail::hash_character_ngrams(strings, ngrams, seed, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp index 221c0a62f3e..47b9ac46d12 100644 --- a/cpp/tests/streams/text/ngrams_test.cpp +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ TEST_F(TextNGramsTest, HashCharacterNgrams) auto input = cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); nvtext::hash_character_ngrams( - cudf::strings_column_view(input), 5, cudf::test::get_default_stream()); + cudf::strings_column_view(input), 5, 5, cudf::test::get_default_stream()); } TEST_F(TextNGramsTest, NgramsTokenize) diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index c72c7cfc80e..1a737231389 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -159,6 +159,17 @@ TEST_F(TextGenerateNgramsTest, NgramsHash) 2319357747u}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::hash_character_ngrams(view, 10, 10); + // clang-format off + LCW expected2({LCW{2818025299u, 4026424618u, 578054337u, 2107870805u, 3942221995u, + 2802685757u, 2686450821u, 584898501u, 2206824201u, 487979059u}, + LCW{1154048732u, 3209682333u, 3246563372u, 3789750511u, 1287153502u, + 3759561568u, 1092423314u, 339538635u, 4265577390u, 879551618u, + 4222824617u, 1774528854u, 1028254379u, 485918316u, 879142987u, 3619248543u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } TEST_F(TextGenerateNgramsTest, NgramsHashErrors) From 4ec389b4c515e4b3b85d6fd28b1471b1f1de830d Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 13 Jan 2025 09:27:13 -0800 Subject: [PATCH 07/12] Implement `HOST_UDF` aggregation for reduction and segmented reduction (#17645) Following https://github.com/rapidsai/cudf/pull/17592, this enables `HOST_UDF` aggregation in reduction and segmented reduction, allowing to execute a host-side user-defined function (UDF) through libcudf aggregation framework. Closes https://github.com/rapidsai/cudf/issues/16633. Authors: - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Yunsong Wang (https://github.com/PointKernel) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/17645 --- cpp/include/cudf/aggregation.hpp | 4 +- cpp/include/cudf/aggregation/host_udf.hpp | 478 +++++++++++------- .../cudf/detail/aggregation/aggregation.hpp | 6 +- cpp/src/groupby/groupby.cu | 9 +- cpp/src/groupby/sort/aggregate.cpp | 81 ++- cpp/src/groupby/sort/host_udf_aggregation.cpp | 48 +- cpp/src/reductions/reductions.cpp | 16 +- cpp/src/reductions/segmented/reductions.cpp | 17 +- cpp/tests/CMakeLists.txt | 3 +- cpp/tests/groupby/host_udf_example_tests.cu | 75 +-- cpp/tests/groupby/host_udf_tests.cpp | 245 ++++----- .../reductions/host_udf_example_tests.cu | 422 ++++++++++++++++ .../main/java/ai/rapids/cudf/Aggregation.java | 2 +- .../ai/rapids/cudf/GroupByAggregation.java | 2 +- .../ai/rapids/cudf/ReductionAggregation.java | 17 +- .../cudf/SegmentedReductionAggregation.java | 11 +- 16 files changed, 941 insertions(+), 495 deletions(-) create mode 100644 cpp/tests/reductions/host_udf_example_tests.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index a1b7db5e08a..2b2a660bed7 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -601,7 +601,7 @@ std::unique_ptr make_udf_aggregation(udf_type type, data_type output_type); // Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation. -struct host_udf_base; +class host_udf_base; /** * @brief Factory to create a HOST_UDF aggregation. 
diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp index bbce76dc5f3..451d75137e4 100644 --- a/cpp/include/cudf/aggregation/host_udf.hpp +++ b/cpp/include/cudf/aggregation/host_udf.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,18 +17,16 @@ #pragma once #include +#include #include #include #include -#include #include #include +#include #include -#include -#include -#include /** * @file host_udf.hpp @@ -43,49 +41,141 @@ namespace CUDF_EXPORT cudf { */ /** - * @brief The interface for host-based UDF implementation. + * @brief The fundamental interface for host-based UDF implementation. * - * An implementation of host-based UDF needs to be derived from this base class, defining - * its own version of the required functions. In particular: - * - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`, - * and `clone` functions. - * - If necessary, the derived class can also override `do_hash` to compute hashing for its - * instance, and `get_required_data` to selectively access to the input data as well as - * intermediate data provided by libcudf. + * This class declares the functions `do_hash`, `is_equal`, and `clone` that must be defined in + * the users' UDF implementation. These functions are required for libcudf aggregation framework + * to perform its operations. + */ +class host_udf_base { + // Declare constructor private to prevent the users from deriving from this class. + private: + host_udf_base() = default; ///< Default constructor + + // Only allow deriving from the structs below. + friend struct reduce_host_udf; + friend struct segmented_reduce_host_udf; + friend struct groupby_host_udf; + + public: + virtual ~host_udf_base() = default; ///< Default destructor + + /** + * @brief Computes hash value of the instance. + * + * Overriding this function is optional when the derived class has data members such that + * each instance needs to be differentiated from each other. + * + * @return The hash value of the instance + */ + [[nodiscard]] virtual std::size_t do_hash() const + { + return std::hash{}(static_cast(aggregation::Kind::HOST_UDF)); + } + + /** + * @brief Compares two instances of the derived class for equality. + * @param other The other instance to compare with + * @return True if the two instances are equal + */ + [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + + /** + * @brief Clones the instance. + * + * The instances of the derived class should be lightweight for efficient cloning. + * + * @return A new instance cloned from this one + */ + [[nodiscard]] virtual std::unique_ptr clone() const = 0; +}; + +/** + * @brief The interface for host-based UDF implementation for reduction contexts. + * + * An implementation of host-based UDF for reduction needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such derived classes must also define the `operator()` function to perform reduction + * operations. 
* - * Example of such implementation: + * Example: * @code{.cpp} - * struct my_udf_aggregation : cudf::host_udf_base { + * struct my_udf_aggregation : cudf::reduce_host_udf { * my_udf_aggregation() = default; * - * // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`, - * // and the result from groupby `MAX` aggregation. - * [[nodiscard]] data_attribute_set_t get_required_data() const override + * [[nodiscard]] std::unique_ptr operator()( + * column_view const& input, + * data_type output_dtype, + * std::optional> init, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override * { - * return {groupby_data_attribute::GROUPED_VALUES, - * groupby_data_attribute::GROUP_OFFSETS, - * cudf::make_max_aggregation()}; + * // Perform reduction computation using the input data and return the reduction result. + * // This is where the actual reduction logic is implemented. * } * - * [[nodiscard]] output_t get_empty_output( - * [[maybe_unused]] std::optional output_dtype, - * [[maybe_unused]] rmm::cuda_stream_view stream, - * [[maybe_unused]] rmm::device_async_resource_ref mr) const override + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override * { - * // This UDF aggregation always returns a column of type INT32. - * return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. + * return dynamic_cast(&other) != nullptr; * } * - * [[nodiscard]] output_t operator()(input_map_t const& input, - * rmm::cuda_stream_view stream, - * rmm::device_async_resource_ref mr) const override + * [[nodiscard]] std::unique_ptr clone() const override * { - * // Perform UDF computation using the input data and return the result. + * return std::make_unique(); + * } + * }; + * @endcode + */ +struct reduce_host_udf : host_udf_base { + /** + * @brief Perform reduction operations. + * + * @param input The input column for reduction + * @param output_dtype The data type for the final output scalar + * @param init The initial value of the reduction + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation + */ + [[nodiscard]] virtual std::unique_ptr operator()( + column_view const& input, + data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; +}; + +/** + * @brief The interface for host-based UDF implementation for segmented reduction context. + * + * An implementation of host-based UDF for segmented reduction needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such derived class must also define the `operator()` function to perform segmented reduction. + * + * Example: + * @code{.cpp} + * struct my_udf_aggregation : cudf::segmented_reduce_host_udf { + * my_udf_aggregation() = default; + * + * [[nodiscard]] std::unique_ptr operator()( + * column_view const& input, + * device_span offsets, + * data_type output_dtype, + * null_policy null_handling, + * std::optional> init, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform computation using the input data and return the result. + * // This is where the actual segmented reduction logic is implemented. 
* } * * [[nodiscard]] bool is_equal(host_udf_base const& other) const override * { * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. * return dynamic_cast(&other) != nullptr; * } * @@ -96,198 +186,232 @@ namespace CUDF_EXPORT cudf { * }; * @endcode */ -struct host_udf_base { - host_udf_base() = default; - virtual ~host_udf_base() = default; - +struct segmented_reduce_host_udf : host_udf_base { /** - * @brief Define the possible data needed for groupby aggregations. + * @brief Perform segmented reduction operations. * - * Note that only sort-based groupby aggregations are supported. + * @param input The input column for reduction + * @param offsets A list of offsets defining the segments for reduction + * @param output_dtype The data type for the final output column + * @param null_handling If `INCLUDE` then the reduction result is valid only if all elements in + * the segment are valid, and if `EXCLUDE` then the reduction result is valid if any + * element in the segment is valid + * @param init The initial value of the reduction + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation */ - enum class groupby_data_attribute : int32_t { - INPUT_VALUES, ///< The input values column. - GROUPED_VALUES, ///< The input values grouped according to the input `keys` for which the - ///< values within each group maintain their original order. - SORTED_GROUPED_VALUES, ///< The input values grouped according to the input `keys` and - ///< sorted within each group. - NUM_GROUPS, ///< The number of groups (i.e., number of distinct keys). - GROUP_OFFSETS, ///< The offsets separating groups. - GROUP_LABELS ///< Group labels (which is also the same as group indices). - }; + [[nodiscard]] virtual std::unique_ptr operator()( + column_view const& input, + device_span offsets, + data_type output_dtype, + null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; +}; +// Forward declaration. +namespace groupby ::detail { +struct aggregate_result_functor; +} + +/** + * @brief The interface for host-based UDF implementation for groupby aggregation context. + * + * An implementation of host-based UDF for groupby needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such a derived class must also define the functions `get_empty_output()` to return result when + * the input is empty, and ``operator()`` to perform its groupby operations. + * + * During execution, the derived class can access internal data provided by the libcudf groupby + * framework through a set of ``get*`` accessors, as well as calling other built-in groupby + * aggregations through the ``compute_aggregation`` function. + * + * @note The derived class can only perform sort-based groupby aggregations. Hash-based groupby + * aggregations require more complex data structure and is not yet supported. + * + * Example: + * @code{.cpp} + * struct my_udf_aggregation : cudf::groupby_host_udf { + * my_udf_aggregation() = default; + * + * [[nodiscard]] std::unique_ptr get_empty_output( + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Return a column corresponding to the result when the input values column is empty. 
+ * } + * + * [[nodiscard]] std::unique_ptr operator()( + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform UDF computation using the input data and return the result. + * } + * + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override + * { + * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. + * return dynamic_cast(&other) != nullptr; + * } + * + * [[nodiscard]] std::unique_ptr clone() const override + * { + * return std::make_unique(); + * } + * }; + * @endcode + */ +struct groupby_host_udf : host_udf_base { /** - * @brief Describe possible data that may be needed in the derived class for its operations. + * @brief Get the output when the input values column is empty. * - * Such data can be either intermediate data such as sorted values or group labels etc, or the - * results of other aggregations. + * This is called in libcudf when the input values column is empty. In such situations libcudf + * tries to generate the output directly without unnecessarily evaluating the intermediate data. * - * Each derived host-based UDF class may need a different set of data. It is inefficient to - * evaluate and pass down all these possible data at once from libcudf. A solution for that is, - * the derived class can define a subset of data that it needs and libcudf will evaluate - * and pass down only data requested from that set. + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation when the input values column is empty */ - struct data_attribute { - /** - * @brief Hold all possible data types for the input of the aggregation in the derived class. - */ - using value_type = std::variant>; - value_type value; ///< The actual data attribute, wrapped by this struct - ///< as a wrapper is needed to define `hash` and `equal_to` functors. - - data_attribute() = default; ///< Default constructor - data_attribute(data_attribute&&) = default; ///< Move constructor - - /** - * @brief Construct a new data attribute from an aggregation attribute. - * @param value_ An aggregation attribute - */ - template )> - data_attribute(T value_) : value{value_} - { - } - - /** - * @brief Construct a new data attribute from another aggregation request. - * @param value_ An aggregation request - */ - template || - std::is_same_v)> - data_attribute(std::unique_ptr value_) : value{std::move(value_)} - { - CUDF_EXPECTS(std::get>(value) != nullptr, - "Invalid aggregation request."); - if constexpr (std::is_same_v) { - CUDF_EXPECTS( - dynamic_cast(std::get>(value).get()) != nullptr, - "Requesting results from other aggregations is only supported in groupby " - "aggregations."); - } - } - - /** - * @brief Copy constructor. - * @param other The other data attribute to copy from - */ - data_attribute(data_attribute const& other); - - /** - * @brief Hash functor for `data_attribute`. - */ - struct hash { - /** - * @brief Compute the hash value of a data attribute. - * @param attr The data attribute to hash - * @return The hash value of the data attribute - */ - std::size_t operator()(data_attribute const& attr) const; - }; // struct hash - - /** - * @brief Equality comparison functor for `data_attribute`. - */ - struct equal_to { - /** - * @brief Check if two data attributes are equal. 
- * @param lhs The left-hand side data attribute - * @param rhs The right-hand side data attribute - * @return True if the two data attributes are equal - */ - bool operator()(data_attribute const& lhs, data_attribute const& rhs) const; - }; // struct equal_to - }; // struct data_attribute + [[nodiscard]] virtual std::unique_ptr get_empty_output( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0; /** - * @brief Set of attributes for the input data that is needed for computing the aggregation. + * @brief Perform the main groupby computation for the host-based UDF. + * + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation */ - using data_attribute_set_t = - std::unordered_set; + [[nodiscard]] virtual std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0; + + private: + // Allow the struct `aggregate_result_functor` to set its private callback variables. + friend struct groupby::detail::aggregate_result_functor; /** - * @brief Return a set of attributes for the data that is needed for computing the aggregation. - * - * The derived class should return the attributes corresponding to only the data that it needs to - * avoid unnecessary computation performed in libcudf. If this function is not overridden, an - * empty set is returned. That means all the data attributes (except results from other - * aggregations in groupby) will be needed. - * - * @return A set of `data_attribute` + * @brief Callback to access the input values column. + */ + std::function callback_input_values; + + /** + * @brief Callback to access the input values grouped according to the input keys for which the + * values within each group maintain their original order. + */ + std::function callback_grouped_values; + + /** + * @brief Callback to access the input values grouped according to the input keys and sorted + * within each group. + */ + std::function callback_sorted_grouped_values; + + /** + * @brief Callback to access the number of groups (i.e., number of distinct keys). */ - [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; } + std::function callback_num_groups; /** - * @brief Hold all possible types of the data that is passed to the derived class for executing - * the aggregation. + * @brief Callback to access the offsets separating groups. */ - using input_data_t = std::variant>; + std::function(void)> callback_group_offsets; /** - * @brief Input to the aggregation, mapping from each data attribute to its actual data. + * @brief Callback to access the group labels (which is also the same as group indices). */ - using input_map_t = std:: - unordered_map; + std::function(void)> callback_group_labels; /** - * @brief Output type of the aggregation. + * @brief Callback to access the result from other groupby aggregations. + */ + std::function)> callback_compute_aggregation; + + protected: + /** + * @brief Access the input values column. * - * Currently only a single type is supported as the output of the aggregation, but it will hold - * more type in the future when reduction is supported. + * @return The input values column. 
*/ - using output_t = std::variant>; + [[nodiscard]] column_view get_input_values() const + { + CUDF_EXPECTS(callback_input_values, "Uninitialized callback_input_values."); + return callback_input_values(); + } /** - * @brief Get the output when the input values column is empty. + * @brief Access the input values grouped according to the input keys for which the values + * within each group maintain their original order. * - * This is called in libcudf when the input values column is empty. In such situations libcudf - * tries to generate the output directly without unnecessarily evaluating the intermediate data. + * @return The grouped values column. + */ + [[nodiscard]] column_view get_grouped_values() const + { + CUDF_EXPECTS(callback_grouped_values, "Uninitialized callback_grouped_values."); + return callback_grouped_values(); + } + + /** + * @brief Access the input values grouped according to the input keys and sorted within each + * group. * - * @param output_dtype The expected output data type - * @param stream The CUDA stream to use for any kernel launches - * @param mr Device memory resource to use for any allocations - * @return The output result of the aggregation when input values is empty + * @return The sorted grouped values column. */ - [[nodiscard]] virtual output_t get_empty_output(std::optional output_dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; + [[nodiscard]] column_view get_sorted_grouped_values() const + { + CUDF_EXPECTS(callback_sorted_grouped_values, "Uninitialized callback_sorted_grouped_values."); + return callback_sorted_grouped_values(); + } /** - * @brief Perform the main computation for the host-based UDF. + * @brief Access the number of groups (i.e., number of distinct keys). * - * @param input The input data needed for performing all computation - * @param stream The CUDA stream to use for any kernel launches - * @param mr Device memory resource to use for any allocations - * @return The output result of the aggregation + * @return The number of groups. */ - [[nodiscard]] virtual output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; + [[nodiscard]] size_type get_num_groups() const + { + CUDF_EXPECTS(callback_num_groups, "Uninitialized callback_num_groups."); + return callback_num_groups(); + } /** - * @brief Computes hash value of the class's instance. - * @return The hash value of the instance + * @brief Access the offsets separating groups. + * + * @return The array of group offsets. */ - [[nodiscard]] virtual std::size_t do_hash() const + [[nodiscard]] device_span get_group_offsets() const { - return std::hash{}(static_cast(aggregation::Kind::HOST_UDF)); + CUDF_EXPECTS(callback_group_offsets, "Uninitialized callback_group_offsets."); + return callback_group_offsets(); } /** - * @brief Compares two instances of the derived class for equality. - * @param other The other derived class's instance to compare with - * @return True if the two instances are equal + * @brief Access the group labels (which is also the same as group indices). + * + * @return The array of group labels. */ - [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + [[nodiscard]] device_span get_group_labels() const + { + CUDF_EXPECTS(callback_group_labels, "Uninitialized callback_group_labels."); + return callback_group_labels(); + } /** - * @brief Clones the instance. + * @brief Compute a built-in groupby aggregation and access its result. 
* - * A class derived from `host_udf_base` should not store too much data such that its instances - * remain lightweight for efficient cloning. + * This allows the derived class to call any other built-in groupby aggregations on the same input + * values column and access the output for its operations. * - * @return A new instance cloned from this + * @param other_agg An arbitrary built-in groupby aggregation + * @return A `column_view` object corresponding to the output result of the given aggregation */ - [[nodiscard]] virtual std::unique_ptr clone() const = 0; + [[nodiscard]] column_view compute_aggregation(std::unique_ptr other_agg) const + { + CUDF_EXPECTS(callback_compute_aggregation, "Uninitialized callback for computing aggregation."); + return callback_compute_aggregation(std::move(other_agg)); + } }; /** @} */ // end of group diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index d873e93bd20..5574ed6ea6e 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -967,7 +967,9 @@ class udf_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying host-based UDF aggregation. */ -class host_udf_aggregation final : public groupby_aggregation { +class host_udf_aggregation final : public groupby_aggregation, + public reduce_aggregation, + public segmented_reduce_aggregation { public: std::unique_ptr udf_ptr; diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 4c90cd0eef5..6234148e9fa 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,8 +145,11 @@ struct empty_column_constructor { } if constexpr (k == aggregation::Kind::HOST_UDF) { - auto const& udf_ptr = dynamic_cast(agg).udf_ptr; - return std::get>(udf_ptr->get_empty_output(std::nullopt, stream, mr)); + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation."); + return udf_ptr->get_empty_output(stream, mr); } return make_empty_column(target_type(values.type(), k)); diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 6480070e85a..fb3f7559d64 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -795,58 +795,41 @@ void aggregate_result_functor::operator()(aggregation con { if (cache.has_result(values, agg)) { return; } - auto const& udf_ptr = dynamic_cast(agg).udf_ptr; - auto const data_attrs = [&]() -> host_udf_base::data_attribute_set_t { - if (auto tmp = udf_ptr->get_required_data(); !tmp.empty()) { return tmp; } - // Empty attribute set means everything. 
- return {host_udf_base::groupby_data_attribute::INPUT_VALUES, - host_udf_base::groupby_data_attribute::GROUPED_VALUES, - host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, - host_udf_base::groupby_data_attribute::NUM_GROUPS, - host_udf_base::groupby_data_attribute::GROUP_OFFSETS, - host_udf_base::groupby_data_attribute::GROUP_LABELS}; - }(); + auto const& udf_base_ptr = dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation."); - // Do not cache udf_input, as the actual input data may change from run to run. - host_udf_base::input_map_t udf_input; - for (auto const& attr : data_attrs) { - CUDF_EXPECTS(std::holds_alternative(attr.value) || - std::holds_alternative>(attr.value), - "Invalid input data attribute for HOST_UDF groupby aggregation."); - if (std::holds_alternative(attr.value)) { - switch (std::get(attr.value)) { - case host_udf_base::groupby_data_attribute::INPUT_VALUES: - udf_input.emplace(attr, values); - break; - case host_udf_base::groupby_data_attribute::GROUPED_VALUES: - udf_input.emplace(attr, get_grouped_values()); - break; - case host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES: - udf_input.emplace(attr, get_sorted_values()); - break; - case host_udf_base::groupby_data_attribute::NUM_GROUPS: - udf_input.emplace(attr, helper.num_groups(stream)); - break; - case host_udf_base::groupby_data_attribute::GROUP_OFFSETS: - udf_input.emplace(attr, helper.group_offsets(stream)); - break; - case host_udf_base::groupby_data_attribute::GROUP_LABELS: - udf_input.emplace(attr, helper.group_labels(stream)); - break; - default: CUDF_UNREACHABLE("Invalid input data attribute for HOST_UDF groupby aggregation."); - } - } else { // data is result from another aggregation - auto other_agg = std::get>(attr.value)->clone(); + if (!udf_ptr->callback_input_values) { + udf_ptr->callback_input_values = [&]() -> column_view { return values; }; + } + if (!udf_ptr->callback_grouped_values) { + udf_ptr->callback_grouped_values = [&]() -> column_view { return get_grouped_values(); }; + } + if (!udf_ptr->callback_sorted_grouped_values) { + udf_ptr->callback_sorted_grouped_values = [&]() -> column_view { return get_sorted_values(); }; + } + if (!udf_ptr->callback_num_groups) { + udf_ptr->callback_num_groups = [&]() -> size_type { return helper.num_groups(stream); }; + } + if (!udf_ptr->callback_group_offsets) { + udf_ptr->callback_group_offsets = [&]() -> device_span { + return helper.group_offsets(stream); + }; + } + if (!udf_ptr->callback_group_labels) { + udf_ptr->callback_group_labels = [&]() -> device_span { + return helper.group_labels(stream); + }; + } + if (!udf_ptr->callback_compute_aggregation) { + udf_ptr->callback_compute_aggregation = + [&](std::unique_ptr other_agg) -> column_view { cudf::detail::aggregation_dispatcher(other_agg->kind, *this, *other_agg); - auto result = cache.get_result(values, *other_agg); - udf_input.emplace(std::move(other_agg), std::move(result)); - } + return cache.get_result(values, *other_agg); + }; } - auto output = (*udf_ptr)(udf_input, stream, mr); - CUDF_EXPECTS(std::holds_alternative>(output), - "Invalid output type from HOST_UDF groupby aggregation."); - cache.add_result(values, agg, std::get>(std::move(output))); + cache.add_result(values, agg, (*udf_ptr)(stream, mr)); } } // namespace detail diff --git a/cpp/src/groupby/sort/host_udf_aggregation.cpp b/cpp/src/groupby/sort/host_udf_aggregation.cpp index 
0da47e17f48..6f1fe80c4bd 100644 --- a/cpp/src/groupby/sort/host_udf_aggregation.cpp +++ b/cpp/src/groupby/sort/host_udf_aggregation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,51 +16,9 @@ #include #include -#include namespace cudf { -host_udf_base::data_attribute::data_attribute(data_attribute const& other) - : value{std::visit(cudf::detail::visitor_overload{[](auto const& val) { return value_type{val}; }, - [](std::unique_ptr const& val) { - return value_type{val->clone()}; - }}, - other.value)} -{ -} - -std::size_t host_udf_base::data_attribute::hash::operator()(data_attribute const& attr) const -{ - auto const hash_value = - std::visit(cudf::detail::visitor_overload{ - [](auto const& val) { return std::hash{}(static_cast(val)); }, - [](std::unique_ptr const& val) { return val->do_hash(); }}, - attr.value); - return std::hash{}(attr.value.index()) ^ hash_value; -} - -bool host_udf_base::data_attribute::equal_to::operator()(data_attribute const& lhs, - data_attribute const& rhs) const -{ - auto const& lhs_val = lhs.value; - auto const& rhs_val = rhs.value; - if (lhs_val.index() != rhs_val.index()) { return false; } - return std::visit( - cudf::detail::visitor_overload{ - [](auto const& lhs_val, auto const& rhs_val) { - if constexpr (std::is_same_v) { - return lhs_val == rhs_val; - } else { - return false; - } - }, - [](std::unique_ptr const& lhs_val, std::unique_ptr const& rhs_val) { - return lhs_val->is_equal(*rhs_val); - }}, - lhs_val, - rhs_val); -} - namespace detail { host_udf_aggregation::host_udf_aggregation(std::unique_ptr udf_ptr_) @@ -99,5 +57,9 @@ template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation); template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation(std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); } // namespace cudf diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 75ebc078930..928625a7e8f 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -144,6 +145,13 @@ struct reduce_dispatch_functor { auto td_agg = static_cast(agg); return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr); } + case aggregation::HOST_UDF: { + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for reduction."); + return (*udf_ptr)(col, output_dtype, init, stream, mr); + } // case aggregation::HOST_UDF default: CUDF_FAIL("Unsupported reduction operator"); } } @@ -161,9 +169,11 @@ std::unique_ptr reduce(column_view const& col, cudf::data_type_error); if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || - agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL || + agg.kind == aggregation::HOST_UDF)) { CUDF_FAIL( - "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); + "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF " + "aggregation types"); } // Returns default scalar if input column is empty or all null diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 1c3a2b0c0f3..5835bfcf0a1 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +#include #include #include #include @@ -98,6 +100,13 @@ struct segmented_reduce_dispatch_functor { } case segmented_reduce_aggregation::NUNIQUE: return segmented_nunique(col, offsets, null_handling, stream, mr); + case aggregation::HOST_UDF: { + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for segmented reduction."); + return (*udf_ptr)(col, offsets, output_dtype, null_handling, init, stream, mr); + } // case aggregation::HOST_UDF default: CUDF_FAIL("Unsupported aggregation type."); } } @@ -117,9 +126,11 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values, cudf::data_type_error); if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || - agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL || + agg.kind == aggregation::HOST_UDF)) { CUDF_FAIL( - "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); + "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF " + "aggregation types"); } if (segmented_values.is_empty() && offsets.empty()) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 344979e1288..35877ac34b9 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -220,11 +220,12 @@ ConfigureTest( REDUCTIONS_TEST reductions/collect_ops_tests.cpp reductions/ewm_tests.cpp + reductions/host_udf_example_tests.cu + reductions/list_rank_test.cpp reductions/rank_tests.cpp reductions/reduction_tests.cpp reductions/scan_tests.cpp reductions/segmented_reduction_tests.cpp - reductions/list_rank_test.cpp reductions/tdigest_tests.cu GPUS 1 PERCENT 70 diff --git a/cpp/tests/groupby/host_udf_example_tests.cu b/cpp/tests/groupby/host_udf_example_tests.cu index a454bd692fc..e1ded37d8a7 100644 --- a/cpp/tests/groupby/host_udf_example_tests.cu +++ b/cpp/tests/groupby/host_udf_example_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,9 +21,7 @@ #include #include #include -#include #include -#include #include #include @@ -34,6 +32,9 @@ #include #include +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; + namespace { /** * @brief A host-based UDF implementation for groupby. @@ -41,42 +42,21 @@ namespace { * For each group of values, the aggregation computes * `(group_idx + 1) * group_sum_of_squares - group_max * group_sum`. */ -struct host_udf_groupby_example : cudf::host_udf_base { +struct host_udf_groupby_example : cudf::groupby_host_udf { host_udf_groupby_example() = default; - [[nodiscard]] data_attribute_set_t get_required_data() const override - { - // We need grouped values, group offsets, group labels, and also results from groups' - // MAX and SUM aggregations. 
- return {groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS, - cudf::make_max_aggregation(), - cudf::make_sum_aggregation()}; - } - - [[nodiscard]] output_t get_empty_output( - [[maybe_unused]] std::optional output_dtype, - [[maybe_unused]] rmm::cuda_stream_view stream, - [[maybe_unused]] rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr get_empty_output( + rmm::cuda_stream_view, rmm::device_async_resource_ref) const override { return cudf::make_empty_column( cudf::data_type{cudf::type_to_id()}); } - [[nodiscard]] output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override { - auto const& values = - std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); - return cudf::type_dispatcher(values.type(), groupby_fn{this}, input, stream, mr); - } - - [[nodiscard]] std::size_t do_hash() const override - { - // Just return the same hash for all instances of this class. - return std::size_t{12345}; + auto const values = get_grouped_values(); + return cudf::type_dispatcher(values.type(), groupby_fn{*this}, stream, mr); } [[nodiscard]] bool is_equal(host_udf_base const& other) const override @@ -92,37 +72,33 @@ struct host_udf_groupby_example : cudf::host_udf_base { struct groupby_fn { // Store pointer to the parent class so we can call its functions. - host_udf_groupby_example const* parent; + host_udf_groupby_example const& parent; - // For simplicity, this example only accepts double input and always produces double output. + // For simplicity, this example only accepts a single type input and output. using InputType = double; using OutputType = double; template )> - output_t operator()(Args...) const + std::unique_ptr operator()(Args...) 
const { CUDF_FAIL("Unsupported input type."); } template )> - output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + std::unique_ptr operator()(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - auto const& values = - std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); - if (values.size() == 0) { return parent->get_empty_output(std::nullopt, stream, mr); } + auto const values = parent.get_grouped_values(); + if (values.size() == 0) { return parent.get_empty_output(stream, mr); } - auto const offsets = std::get>( - input.at(groupby_data_attribute::GROUP_OFFSETS)); + auto const offsets = parent.get_group_offsets(); CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets."); auto const num_groups = static_cast(offsets.size()) - 1; - auto const group_indices = std::get>( - input.at(groupby_data_attribute::GROUP_LABELS)); - auto const group_max = std::get( - input.at(cudf::make_max_aggregation())); - auto const group_sum = std::get( - input.at(cudf::make_sum_aggregation())); + auto const group_indices = parent.get_group_labels(); + auto const group_max = + parent.compute_aggregation(cudf::make_max_aggregation()); + auto const group_sum = + parent.compute_aggregation(cudf::make_sum_aggregation()); auto const values_dv_ptr = cudf::column_device_view::create(values, stream); auto const output = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, @@ -191,9 +167,6 @@ struct host_udf_groupby_example : cudf::host_udf_base { } // namespace -using doubles_col = cudf::test::fixed_width_column_wrapper; -using int32s_col = cudf::test::fixed_width_column_wrapper; - struct HostUDFGroupbyExampleTest : cudf::test::BaseFixture {}; TEST_F(HostUDFGroupbyExampleTest, SimpleInput) diff --git a/cpp/tests/groupby/host_udf_tests.cpp b/cpp/tests/groupby/host_udf_tests.cpp index 1a0f68c0c6c..17da28cdefc 100644 --- a/cpp/tests/groupby/host_udf_tests.cpp +++ b/cpp/tests/groupby/host_udf_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,178 +26,121 @@ #include namespace { + /** - * @brief A host-based UDF implementation used for unit tests. + * @brief Generate a random aggregation object from {min, max, sum, product}. */ -struct host_udf_test_base : cudf::host_udf_base { +std::unique_ptr get_random_agg() +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distr(1, 4); + switch (distr(gen)) { + case 1: return cudf::make_min_aggregation(); + case 2: return cudf::make_max_aggregation(); + case 3: return cudf::make_sum_aggregation(); + case 4: return cudf::make_product_aggregation(); + default: CUDF_UNREACHABLE("This should not be reached."); + } + return nullptr; +} + +/** + * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
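+ * It exercises the accessor-based cudf::groupby_host_udf interface (get_grouped_values(), get_group_offsets(), compute_aggregation(), ...) and checks the type returned by each accessor.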
+ */ +struct host_udf_groupby_test : cudf::groupby_host_udf { int test_location_line; // the location where testing is called bool* test_run; // to check if the test is accidentally skipped - data_attribute_set_t input_attrs; + bool test_other_agg; // test calling other aggregation - host_udf_test_base(int test_location_line_, bool* test_run_, data_attribute_set_t input_attrs_) - : test_location_line{test_location_line_}, - test_run{test_run_}, - input_attrs(std::move(input_attrs_)) + host_udf_groupby_test(int test_location_line_, bool* test_run_, bool test_other_agg_) + : test_location_line{test_location_line_}, test_run{test_run_}, test_other_agg{test_other_agg_} { } - [[nodiscard]] data_attribute_set_t get_required_data() const override { return input_attrs; } - - // This is the main testing function, which checks for the correctness of input data. - // The rests are just to satisfy the interface. - [[nodiscard]] output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::size_t do_hash() const override { return 0; } + [[nodiscard]] bool is_equal(host_udf_base const& other) const override { - SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); - - test_data_attributes(input, stream, mr); - - *test_run = true; // test is run successfully - return get_empty_output(std::nullopt, stream, mr); + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(test_location_line, test_run, test_other_agg); } - [[nodiscard]] output_t get_empty_output( - [[maybe_unused]] std::optional output_dtype, + [[nodiscard]] std::unique_ptr get_empty_output( [[maybe_unused]] rmm::cuda_stream_view stream, [[maybe_unused]] rmm::device_async_resource_ref mr) const override { - // Unused function - dummy output. + // Dummy output. return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); } - [[nodiscard]] std::size_t do_hash() const override { return 0; } - [[nodiscard]] bool is_equal(host_udf_base const& other) const override { return true; } + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override + { + SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); - // The main test function, which must be implemented for each kind of aggregations - // (groupby/reduction/segmented_reduction). - virtual void test_data_attributes(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; -}; + // Perform tests on types for the groupby data: we must ensure the data corresponding to each + // `groupby_data` enum having the correct type. -/** - * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
- */ -struct host_udf_groupby_test : host_udf_test_base { - host_udf_groupby_test(int test_location_line_, - bool* test_run_, - data_attribute_set_t input_attrs_ = {}) - : host_udf_test_base(test_location_line_, test_run_, std::move(input_attrs_)) - { - } + { + auto const inp_data = get_input_values(); + EXPECT_TRUE((std::is_same_v>)); + } - [[nodiscard]] std::unique_ptr clone() const override - { - return std::make_unique(test_location_line, test_run, input_attrs); - } + { + auto const inp_data = get_grouped_values(); + EXPECT_TRUE((std::is_same_v>)); + } - void test_data_attributes(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override - { - data_attribute_set_t check_attrs = input_attrs; - if (check_attrs.empty()) { - check_attrs = data_attribute_set_t{groupby_data_attribute::INPUT_VALUES, - groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::SORTED_GROUPED_VALUES, - groupby_data_attribute::NUM_GROUPS, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS}; + { + auto const inp_data = get_sorted_grouped_values(); + EXPECT_TRUE((std::is_same_v>)); } - EXPECT_EQ(input.size(), check_attrs.size()); - for (auto const& attr : check_attrs) { - EXPECT_TRUE(input.count(attr) > 0); - EXPECT_TRUE(std::holds_alternative(attr.value) || - std::holds_alternative>(attr.value)); - if (std::holds_alternative(attr.value)) { - switch (std::get(attr.value)) { - case groupby_data_attribute::INPUT_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::GROUPED_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::SORTED_GROUPED_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::NUM_GROUPS: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::GROUP_OFFSETS: - EXPECT_TRUE( - std::holds_alternative>(input.at(attr))); - break; - case groupby_data_attribute::GROUP_LABELS: - EXPECT_TRUE( - std::holds_alternative>(input.at(attr))); - break; - default:; - } - } else { // std::holds_alternative>(attr.value) - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - } + + { + auto const inp_data = get_num_groups(); + EXPECT_TRUE((std::is_same_v>)); } - } -}; -/** - * @brief Get a random subset of input data attributes. - */ -cudf::host_udf_base::data_attribute_set_t get_subset( - cudf::host_udf_base::data_attribute_set_t const& attrs) -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution size_distr(1, attrs.size() - 1); - auto const subset_size = size_distr(gen); - auto const elements = - std::vector(attrs.begin(), attrs.end()); - std::uniform_int_distribution idx_distr(0, attrs.size() - 1); - cudf::host_udf_base::data_attribute_set_t output; - while (output.size() < subset_size) { - output.insert(elements[idx_distr(gen)]); - } - return output; -} + { + auto const inp_data = get_group_offsets(); + EXPECT_TRUE((std::is_same_v, + std::decay_t>)); + } -/** - * @brief Generate a random aggregation object from {min, max, sum, product}. 
- */ -std::unique_ptr get_random_agg() -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution distr(1, 4); - switch (distr(gen)) { - case 1: return cudf::make_min_aggregation(); - case 2: return cudf::make_max_aggregation(); - case 3: return cudf::make_sum_aggregation(); - case 4: return cudf::make_product_aggregation(); - default: CUDF_UNREACHABLE("This should not be reached."); + { + auto const inp_data = get_group_labels(); + EXPECT_TRUE((std::is_same_v, + std::decay_t>)); + } + + // Perform tests on type of the result from computing other aggregations. + if (test_other_agg) { + auto const inp_data = compute_aggregation(get_random_agg()); + EXPECT_TRUE((std::is_same_v>)); + } + + *test_run = true; // test is run successfully + return get_empty_output(stream, mr); } - return nullptr; -} +}; } // namespace using int32s_col = cudf::test::fixed_width_column_wrapper; -// Number of randomly testing on the input data attributes. -// For each test, a subset of data attributes will be randomly generated from all the possible input -// data attributes. The input data corresponding to that subset passed from libcudf will be tested -// for correctness. -constexpr int NUM_RANDOM_TESTS = 20; - struct HostUDFTest : cudf::test::BaseFixture {}; -TEST_F(HostUDFTest, GroupbyAllInput) +TEST_F(HostUDFTest, GroupbyBuiltinInput) { bool test_run = false; auto const keys = int32s_col{0, 1, 2}; auto const vals = int32s_col{0, 1, 2}; auto agg = cudf::make_host_udf_aggregation( - std::make_unique(__LINE__, &test_run)); + std::make_unique(__LINE__, &test_run, /*test_other_agg*/ false)); std::vector requests; requests.emplace_back(); @@ -205,28 +148,22 @@ TEST_F(HostUDFTest, GroupbyAllInput) requests[0].aggregations.push_back(std::move(agg)); cudf::groupby::groupby gb_obj( cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); - [[maybe_unused]] auto const grp_result = - gb_obj.aggregate(requests, cudf::test::get_default_stream()); + [[maybe_unused]] auto const grp_result = gb_obj.aggregate( + requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); EXPECT_TRUE(test_run); } -TEST_F(HostUDFTest, GroupbySomeInput) +TEST_F(HostUDFTest, GroupbyWithCallingOtherAggregations) { - auto const keys = int32s_col{0, 1, 2}; - auto const vals = int32s_col{0, 1, 2}; - auto const all_attrs = cudf::host_udf_base::data_attribute_set_t{ - cudf::host_udf_base::groupby_data_attribute::INPUT_VALUES, - cudf::host_udf_base::groupby_data_attribute::GROUPED_VALUES, - cudf::host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, - cudf::host_udf_base::groupby_data_attribute::NUM_GROUPS, - cudf::host_udf_base::groupby_data_attribute::GROUP_OFFSETS, - cudf::host_udf_base::groupby_data_attribute::GROUP_LABELS}; + auto const keys = int32s_col{0, 1, 2}; + auto const vals = int32s_col{0, 1, 2}; + + constexpr int NUM_RANDOM_TESTS = 20; + for (int i = 0; i < NUM_RANDOM_TESTS; ++i) { - bool test_run = false; - auto input_attrs = get_subset(all_attrs); - input_attrs.insert(get_random_agg()); - auto agg = cudf::make_host_udf_aggregation( - std::make_unique(__LINE__, &test_run, std::move(input_attrs))); + bool test_run = false; + auto agg = cudf::make_host_udf_aggregation( + std::make_unique(__LINE__, &test_run, /*test_other_agg*/ true)); std::vector requests; requests.emplace_back(); @@ -234,8 +171,8 @@ TEST_F(HostUDFTest, GroupbySomeInput) requests[0].aggregations.push_back(std::move(agg)); cudf::groupby::groupby gb_obj( cudf::table_view({keys}), 
cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); - [[maybe_unused]] auto const grp_result = - gb_obj.aggregate(requests, cudf::test::get_default_stream()); + [[maybe_unused]] auto const grp_result = gb_obj.aggregate( + requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); EXPECT_TRUE(test_run); } } diff --git a/cpp/tests/reductions/host_udf_example_tests.cu b/cpp/tests/reductions/host_udf_example_tests.cu new file mode 100644 index 00000000000..67b88c5306b --- /dev/null +++ b/cpp/tests/reductions/host_udf_example_tests.cu @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; +using int64s_col = cudf::test::fixed_width_column_wrapper; + +namespace { +/** + * @brief A host-based UDF implementation for reduction. + * + * The aggregation computes `sum(value^2, for value in group)` (this is sum of squared). + */ +struct host_udf_reduction_example : cudf::reduce_host_udf { + host_udf_reduction_example() = default; + + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + return cudf::double_type_dispatcher( + input.type(), output_dtype, reduce_fn{}, input, output_dtype, init, stream, mr); + } + + [[nodiscard]] bool is_equal(host_udf_base const& other) const override + { + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(); + } + + struct reduce_fn { + // For simplicity, this example only accepts a single type input and output. + using InputType = double; + using OutputType = int64_t; + + template || !std::is_same_v)> + std::unique_ptr operator()(Args...) 
const + { + CUDF_FAIL("Unsupported input/output type."); + } + + template && std::is_same_v)> + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id()}, + "Invalid output type."); + if (input.size() == 0) { + return cudf::make_default_constructed_scalar(output_dtype, stream, mr); + } + + auto const init_value = [&]() -> InputType { + if (init.has_value() && init.value().get().is_valid(stream)) { + auto const numeric_init_scalar = + dynamic_cast const*>(&init.value().get()); + CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction."); + return numeric_init_scalar->value(stream); + } + return InputType{0}; + }(); + + auto const input_dv_ptr = cudf::column_device_view::create(input, stream); + auto const result = thrust::transform_reduce(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + transform_fn{*input_dv_ptr}, + static_cast(init_value), + thrust::plus<>{}); + + auto output = cudf::make_numeric_scalar(output_dtype, stream, mr); + static_cast*>(output.get())->set_value(result, stream); + return output; + } + + struct transform_fn { + cudf::column_device_view values; + OutputType __device__ operator()(cudf::size_type idx) const + { + if (values.is_null(idx)) { return OutputType{0}; } + auto const val = static_cast(values.element(idx)); + return val * val; + } + }; + }; +}; + +} // namespace + +struct HostUDFReductionExampleTest : cudf::test::BaseFixture {}; + +TEST_F(HostUDFReductionExampleTest, SimpleInput) +{ + auto const vals = doubles_col{0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const reduced = cudf::reduce(vals, + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + EXPECT_TRUE(reduced->is_valid()); + EXPECT_EQ(cudf::type_id::INT64, reduced->type().id()); + auto const result = + static_cast*>(reduced.get())->value(cudf::get_default_stream()); + auto constexpr expected = 55; // 0^2 + 1^2 + 2^2 + 3^2 + 4^2 + 5^2 = 55 + EXPECT_EQ(expected, result); +} + +TEST_F(HostUDFReductionExampleTest, EmptyInput) +{ + auto const vals = doubles_col{}; + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const reduced = cudf::reduce(vals, + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + EXPECT_FALSE(reduced->is_valid()); + EXPECT_EQ(cudf::type_id::INT64, reduced->type().id()); +} + +namespace { + +/** + * @brief A host-based UDF implementation for segmented reduction. + * + * The aggregation computes `sum(value^2, for value in group)` (this is sum of squared). 
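+ * Each segment's sum is additionally multiplied by the segment's row count, as reflected in the expected values of the tests below.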
+ */ +struct host_udf_segmented_reduction_example : cudf::segmented_reduce_host_udf { + host_udf_segmented_reduction_example() = default; + + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::device_span offsets, + cudf::data_type output_dtype, + cudf::null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + return cudf::double_type_dispatcher(input.type(), + output_dtype, + segmented_reduce_fn{}, + input, + offsets, + output_dtype, + null_handling, + init, + stream, + mr); + } + + [[nodiscard]] bool is_equal(host_udf_base const& other) const override + { + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(); + } + + struct segmented_reduce_fn { + // For simplicity, this example only accepts a single type input and output. + using InputType = double; + using OutputType = int64_t; + + template || !std::is_same_v)> + std::unique_ptr operator()(Args...) const + { + CUDF_FAIL("Unsupported input/output type."); + } + + template && std::is_same_v)> + std::unique_ptr operator()( + cudf::column_view const& input, + cudf::device_span offsets, + cudf::data_type output_dtype, + cudf::null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id()}, + "Invalid output type."); + CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets."); + auto const num_segments = static_cast(offsets.size()) - 1; + + if (input.size() == 0) { + if (num_segments <= 0) { return cudf::make_empty_column(output_dtype); } + return cudf::make_numeric_column( + output_dtype, num_segments, cudf::mask_state::ALL_NULL, stream, mr); + } + + auto const init_value = [&]() -> InputType { + if (init.has_value() && init.value().get().is_valid(stream)) { + auto const numeric_init_scalar = + dynamic_cast const*>(&init.value().get()); + CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction."); + return numeric_init_scalar->value(stream); + } + return InputType{0}; + }(); + + auto const input_dv_ptr = cudf::column_device_view::create(input, stream); + auto output = cudf::make_numeric_column( + output_dtype, num_segments, cudf::mask_state::UNALLOCATED, stream); + + // Store row index if it is valid, otherwise store a negative value denoting a null row. 
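+ // A negative index is out of range for the cudf::gather with NULLIFY below, which turns that output row into a null.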
+ rmm::device_uvector valid_idx(num_segments, stream); + + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_segments), + thrust::make_zip_iterator(output->mutable_view().begin(), valid_idx.begin()), + transform_fn{*input_dv_ptr, offsets, static_cast(init_value), null_handling}); + + auto const valid_idx_cv = cudf::column_view{ + cudf::data_type{cudf::type_id::INT32}, num_segments, valid_idx.begin(), nullptr, 0}; + return std::move(cudf::gather(cudf::table_view{{output->view()}}, + valid_idx_cv, + cudf::out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release() + .front()); + } + + struct transform_fn { + cudf::column_device_view values; + cudf::device_span offsets; + OutputType init_value; + cudf::null_policy null_handling; + + thrust::tuple __device__ operator()(cudf::size_type idx) const + { + auto const start = offsets[idx]; + auto const end = offsets[idx + 1]; + + auto constexpr invalid_idx = cuda::std::numeric_limits::lowest(); + if (start == end) { return {OutputType{0}, invalid_idx}; } + + auto sum = init_value; + for (auto i = start; i < end; ++i) { + if (values.is_null(i)) { + if (null_handling == cudf::null_policy::INCLUDE) { sum += init_value * init_value; } + continue; + } + auto const val = static_cast(values.element(i)); + sum += val * val; + } + auto const segment_size = end - start; + return {static_cast(segment_size) * sum, idx}; + } + }; + }; +}; + +} // namespace + +struct HostUDFSegmentedReductionExampleTest : cudf::test::BaseFixture {}; + +TEST_F(HostUDFSegmentedReductionExampleTest, SimpleInput) +{ + double constexpr null = 0.0; + auto const vals = doubles_col{{0.0, null, 2.0, 3.0, null, 5.0, null, null, 8.0, 9.0}, + {true, false, true, true, false, true, false, false, true, true}}; + auto const offsets = int32s_col{0, 3, 5, 10}.release(); + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + + // Test without init value. + { + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + std::nullopt, // init value + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + // When null_policy is set to `INCLUDE`, the null values are replaced with the init value. + // Since init value is not given, it is set to 0. + // [ 3 * (0^2 + init^2 + 2^2), 2 * (3^2 + init^2), 5 * (5^2 + init^2 + init^2 + 8^2 + 9^2) ] + auto const expected = int64s_col{{12, 18, 850}, {true, true, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with init value, and include nulls. + { + auto const init_scalar = cudf::make_fixed_width_scalar(3.0); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + *init_scalar, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + // When null_policy is set to `INCLUDE`, the null values are replaced with the init value. + // [ 3 * (3 + 0^2 + 3^2 + 2^2), 2 * (3 + 3^2 + 3^2), 5 * (3 + 5^2 + 3^2 + 3^2 + 8^2 + 9^2) ] + auto const expected = int64s_col{{48, 42, 955}, {true, true, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with init value, and exclude nulls. 
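+ // With EXCLUDE, null rows add nothing to the sum, but the segment-size multiplier still counts them.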
+ { + auto const init_scalar = cudf::make_fixed_width_scalar(3.0); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::EXCLUDE, + *init_scalar, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + // [ 3 * (3 + 0^2 + 2^2), 2 * (3 + 3^2), 5 * (3 + 5^2 + 8^2 + 9^2) ] + auto const expected = int64s_col{{21, 24, 865}, {true, true, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +TEST_F(HostUDFSegmentedReductionExampleTest, EmptySegments) +{ + auto const vals = doubles_col{}; + auto const offsets = int32s_col{0, 0, 0, 0}.release(); + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + std::nullopt, // init value + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + auto const expected = int64s_col{{0, 0, 0}, {false, false, false}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(HostUDFSegmentedReductionExampleTest, EmptyInput) +{ + auto const vals = doubles_col{}; + auto const offsets = int32s_col{}.release(); + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + std::nullopt, // init value + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + auto const expected = int64s_col{}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index 2276b223740..c07a58ed8a5 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java index 27966ddfdd4..234a9ec1ced 100644 --- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java index ba8ae379bae..4f047a68f06 100644 --- a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -161,14 +161,14 @@ public static ReductionAggregation median() { /** * Aggregate to compute the specified quantiles. Uses linear interpolation by default. */ - public static ReductionAggregation quantile(double ... quantiles) { + public static ReductionAggregation quantile(double... quantiles) { return new ReductionAggregation(Aggregation.quantile(quantiles)); } /** * Aggregate to compute various quantiles. */ - public static ReductionAggregation quantile(QuantileMethod method, double ... quantiles) { + public static ReductionAggregation quantile(QuantileMethod method, double... quantiles) { return new ReductionAggregation(Aggregation.quantile(method, quantiles)); } @@ -256,7 +256,7 @@ public static ReductionAggregation collectSet() { * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. */ public static ReductionAggregation collectSet(NullPolicy nullPolicy, - NullEquality nullEquality, NaNEquality nanEquality) { + NullEquality nullEquality, NaNEquality nanEquality) { return new ReductionAggregation(Aggregation.collectSet(nullPolicy, nullEquality, nanEquality)); } @@ -286,6 +286,15 @@ public static ReductionAggregation mergeSets(NullEquality nullEquality, NaNEqual return new ReductionAggregation(Aggregation.mergeSets(nullEquality, nanEquality)); } + /** + * Execute a reduction using a host-side user-defined function (UDF). + * @param wrapper The wrapper for the native host UDF instance. + * @return A new ReductionAggregation instance + */ + public static ReductionAggregation hostUDF(HostUDFWrapper wrapper) { + return new ReductionAggregation(Aggregation.hostUDF(wrapper)); + } + /** * Create HistogramAggregation, computing the frequencies for each unique row. * diff --git a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java index 7ed150a2fec..18e7d874886 100644 --- a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -101,4 +101,13 @@ public static SegmentedReductionAggregation any() { public static SegmentedReductionAggregation all() { return new SegmentedReductionAggregation(Aggregation.all()); } + + /** + * Execute a reduction using a host-side user-defined function (UDF). + * @param wrapper The wrapper for the native host UDF instance. + * @return A new SegmentedReductionAggregation instance + */ + public static SegmentedReductionAggregation hostUDF(HostUDFWrapper wrapper) { + return new SegmentedReductionAggregation(Aggregation.hostUDF(wrapper)); + } } From bbf4f7824c23c0c482f52bafdf1ece1213da2f65 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 13 Jan 2025 11:44:54 -0800 Subject: [PATCH 08/12] Host compression (#17656) Add compression APIs to make the nvCOMP use transparent. Remove direct dependency on nvCOMP in the ORC and Parquet writers. Add multi-threaded host-side compression; currently off by default, can only be enabled via `LIBCUDF_USE_HOST_COMPRESSION` environment variable. Currently the host compression adds D2H + H2D transfers. Avoiding the H2D transfer requires large changes to the writers. 
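As an illustration only (not part of this patch): a test or benchmark could opt in as sketched below before invoking a writer. The environment variable names come from the new io/comp/comp.cpp code; the thread count shown is arbitrary.

#include <cstdlib>  // setenv (POSIX)

int main()
{
  // Use host compression when both a host and a device implementation are available.
  setenv("LIBCUDF_USE_HOST_COMPRESSION", "1", 1);
  // Size of the host-side compression thread pool; defaults to std::thread::hardware_concurrency().
  setenv("LIBCUDF_HOST_COMPRESSION_NUM_THREADS", "8", 1);
  // ... run the ORC/Parquet write path here ...
  return 0;
}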
Also moved handling of the AUTO compression type to the options classes, which should own such defaults (translate AUTO to SNAPPY in this case). Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17656 --- cpp/CMakeLists.txt | 2 +- cpp/include/cudf/io/orc.hpp | 22 ++- cpp/src/io/comp/comp.cpp | 163 ++++++++++++++++++++- cpp/src/io/comp/{statistics.cu => comp.cu} | 4 +- cpp/src/io/comp/comp.hpp | 54 ++++++- cpp/src/io/comp/gpuinflate.hpp | 15 +- cpp/src/io/functions.cpp | 3 +- cpp/src/io/orc/orc_gpu.hpp | 4 +- cpp/src/io/orc/stripe_enc.cu | 49 +------ cpp/src/io/orc/writer_impl.cu | 144 ++++++------------ cpp/src/io/orc/writer_impl.hpp | 4 +- cpp/src/io/parquet/writer_impl.cu | 99 +++++-------- cpp/src/io/parquet/writer_impl.hpp | 4 +- cpp/src/io/parquet/writer_impl_helpers.cpp | 46 +----- cpp/src/io/parquet/writer_impl_helpers.hpp | 38 +---- cpp/tests/io/orc_test.cpp | 3 +- cpp/tests/io/parquet_misc_test.cpp | 3 +- 17 files changed, 338 insertions(+), 319 deletions(-) rename cpp/src/io/comp/{statistics.cu => comp.cu} (96%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9dabe4e8800..252cc7897d8 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -477,13 +477,13 @@ add_library( src/io/avro/reader_impl.cu src/io/comp/brotli_dict.cpp src/io/comp/comp.cpp + src/io/comp/comp.cu src/io/comp/cpu_unbz2.cpp src/io/comp/debrotli.cu src/io/comp/gpuinflate.cu src/io/comp/nvcomp_adapter.cpp src/io/comp/nvcomp_adapter.cu src/io/comp/snap.cu - src/io/comp/statistics.cu src/io/comp/uncomp.cpp src/io/comp/unsnap.cu src/io/csv/csv_gpu.cu diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 163fa20806d..82f7761da2e 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -578,7 +578,7 @@ class orc_writer_options { // Specify the sink to use for writer output sink_info _sink; // Specify the compression format to use - compression_type _compression = compression_type::AUTO; + compression_type _compression = compression_type::SNAPPY; // Specify frequency of statistics collection statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) @@ -733,7 +733,11 @@ class orc_writer_options { * * @param comp Compression type */ - void set_compression(compression_type comp) { _compression = comp; } + void set_compression(compression_type comp) + { + _compression = comp; + if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; } + } /** * @brief Choose granularity of statistics collection. 
@@ -865,7 +869,7 @@ class orc_writer_options_builder { */ orc_writer_options_builder& compression(compression_type comp) { - options._compression = comp; + options.set_compression(comp); return *this; } @@ -1026,7 +1030,7 @@ class chunked_orc_writer_options { // Specify the sink to use for writer output sink_info _sink; // Specify the compression format to use - compression_type _compression = compression_type::AUTO; + compression_type _compression = compression_type::SNAPPY; // Specify granularity of statistics collection statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) @@ -1157,7 +1161,11 @@ class chunked_orc_writer_options { * * @param comp The compression type to use */ - void set_compression(compression_type comp) { _compression = comp; } + void set_compression(compression_type comp) + { + _compression = comp; + if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; } + } /** * @brief Choose granularity of statistics collection @@ -1279,7 +1287,7 @@ class chunked_orc_writer_options_builder { */ chunked_orc_writer_options_builder& compression(compression_type comp) { - options._compression = comp; + options.set_compression(comp); return *this; } diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp index 26535bed43b..3800835eaf1 100644 --- a/cpp/src/io/comp/comp.cpp +++ b/cpp/src/io/comp/comp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,22 +16,45 @@ #include "comp.hpp" +#include "gpuinflate.hpp" +#include "io/utilities/getenv_or.hpp" #include "io/utilities/hostdevice_vector.hpp" #include "nvcomp_adapter.hpp" #include #include +#include #include #include #include #include +#include #include // GZIP compression namespace cudf::io::detail { namespace { +auto& h_comp_pool() +{ + static std::size_t pool_size = + getenv_or("LIBCUDF_HOST_COMPRESSION_NUM_THREADS", std::thread::hardware_concurrency()); + static BS::thread_pool pool(pool_size); + return pool; +} + +std::optional to_nvcomp_compression(compression_type compression) +{ + switch (compression) { + case compression_type::SNAPPY: return nvcomp::compression_type::SNAPPY; + case compression_type::ZSTD: return nvcomp::compression_type::ZSTD; + case compression_type::LZ4: return nvcomp::compression_type::LZ4; + case compression_type::ZLIB: return nvcomp::compression_type::DEFLATE; + default: return std::nullopt; + } +} + /** * @brief GZIP host compressor (includes header) */ @@ -98,8 +121,132 @@ std::vector compress_snappy(host_span src, return cudf::detail::make_std_vector_sync(d_dst, stream); } +void device_compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream) +{ + if (compression == compression_type::NONE) { return; } + + auto const nvcomp_type = to_nvcomp_compression(compression); + auto nvcomp_disabled = nvcomp_type.has_value() ? 
nvcomp::is_compression_disabled(*nvcomp_type) + : "invalid compression type"; + if (not nvcomp_disabled) { + return nvcomp::batched_compress(*nvcomp_type, inputs, outputs, results, stream); + } + + switch (compression) { + case compression_type::SNAPPY: return gpu_snap(inputs, outputs, results, stream); + default: CUDF_FAIL("Compression error: " + nvcomp_disabled.value()); + } +} + +void host_compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream) +{ + if (compression == compression_type::NONE) { return; } + + auto const num_chunks = inputs.size(); + auto h_results = cudf::detail::make_host_vector(num_chunks, stream); + auto const h_inputs = cudf::detail::make_host_vector_async(inputs, stream); + auto const h_outputs = cudf::detail::make_host_vector_async(outputs, stream); + stream.synchronize(); + + std::vector> tasks; + auto const num_streams = + std::min({num_chunks, + cudf::detail::global_cuda_stream_pool().get_stream_pool_size(), + h_comp_pool().get_thread_count()}); + auto const streams = cudf::detail::fork_streams(stream, num_streams); + for (size_t i = 0; i < num_chunks; ++i) { + auto const cur_stream = streams[i % streams.size()]; + auto task = [d_in = h_inputs[i], d_out = h_outputs[i], cur_stream, compression]() -> size_t { + auto const h_in = cudf::detail::make_host_vector_sync(d_in, cur_stream); + auto const h_out = compress(compression, h_in, cur_stream); + cudf::detail::cuda_memcpy(d_out.subspan(0, h_out.size()), h_out, cur_stream); + return h_out.size(); + }; + tasks.emplace_back(h_comp_pool().submit_task(std::move(task))); + } + + for (auto i = 0ul; i < num_chunks; ++i) { + h_results[i] = {tasks[i].get(), compression_status::SUCCESS}; + } + cudf::detail::cuda_memcpy_async(results, h_results, stream); +} + +[[nodiscard]] bool host_compression_supported(compression_type compression) +{ + switch (compression) { + case compression_type::GZIP: + case compression_type::NONE: return true; + default: return false; + } +} + +[[nodiscard]] bool device_compression_supported(compression_type compression) +{ + auto const nvcomp_type = to_nvcomp_compression(compression); + switch (compression) { + case compression_type::LZ4: + case compression_type::ZLIB: + case compression_type::ZSTD: return not nvcomp::is_compression_disabled(nvcomp_type.value()); + case compression_type::SNAPPY: + case compression_type::NONE: return true; + default: return false; + } +} + +[[nodiscard]] bool use_host_compression( + compression_type compression, + [[maybe_unused]] device_span const> inputs, + [[maybe_unused]] device_span const> outputs) +{ + CUDF_EXPECTS( + not host_compression_supported(compression) or device_compression_supported(compression), + "Unsupported compression type"); + if (not host_compression_supported(compression)) { return false; } + if (not device_compression_supported(compression)) { return true; } + // If both host and device compression are supported, use the host if the env var is set + return getenv_or("LIBCUDF_USE_HOST_COMPRESSION", 0); +} + } // namespace +std::optional compress_max_allowed_chunk_size(compression_type compression) +{ + if (auto nvcomp_type = to_nvcomp_compression(compression); + nvcomp_type.has_value() and not nvcomp::is_compression_disabled(*nvcomp_type)) { + return nvcomp::compress_max_allowed_chunk_size(*nvcomp_type); + } + return std::nullopt; +} + +[[nodiscard]] size_t compress_required_chunk_alignment(compression_type compression) +{ + auto nvcomp_type = 
to_nvcomp_compression(compression); + if (compression == compression_type::NONE or not nvcomp_type.has_value() or + nvcomp::is_compression_disabled(*nvcomp_type)) { + return 1ul; + } + + return nvcomp::required_alignment(*nvcomp_type); +} + +[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size) +{ + if (compression == compression_type::NONE) { return uncompressed_size; } + + if (auto nvcomp_type = to_nvcomp_compression(compression); nvcomp_type.has_value()) { + return nvcomp::compress_max_output_chunk_size(*nvcomp_type, uncompressed_size); + } + CUDF_FAIL("Unsupported compression type"); +} + std::vector compress(compression_type compression, host_span src, rmm::cuda_stream_view stream) @@ -112,4 +259,18 @@ std::vector compress(compression_type compression, } } +void compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + if (use_host_compression(compression, inputs, outputs)) { + return host_compress(compression, inputs, outputs, results, stream); + } else { + return device_compress(compression, inputs, outputs, results, stream); + } +} + } // namespace cudf::io::detail diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/comp.cu similarity index 96% rename from cpp/src/io/comp/statistics.cu rename to cpp/src/io/comp/comp.cu index caee9145d2c..af0f73869a2 100644 --- a/cpp/src/io/comp/statistics.cu +++ b/cpp/src/io/comp/comp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -#include "gpuinflate.hpp" +#include "comp.hpp" #include diff --git a/cpp/src/io/comp/comp.hpp b/cpp/src/io/comp/comp.hpp index e16f26e1f06..90932a11499 100644 --- a/cpp/src/io/comp/comp.hpp +++ b/cpp/src/io/comp/comp.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,5 +57,57 @@ std::vector compress(compression_type compression, host_span src, rmm::cuda_stream_view stream); +/** + * @brief Maximum size of uncompressed chunks that can be compressed. + * + * @param compression Compression type + * @returns maximum chunk size + */ +[[nodiscard]] std::optional compress_max_allowed_chunk_size(compression_type compression); + +/** + * @brief Gets input and output alignment requirements for the given compression type. + * + * @param compression Compression type + * @returns required alignment + */ +[[nodiscard]] size_t compress_required_chunk_alignment(compression_type compression); + +/** + * @brief Gets the maximum size any chunk could compress to in the batch. + * + * @param compression Compression type + * @param uncompressed_size Size of the largest uncompressed chunk in the batch + */ +[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size); + +/** + * @brief Compresses device memory buffers. 
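+ * Depending on the compression type, the work runs either on the device (kernels / nvCOMP) or on a host-side thread pool; see use_host_compression() and the LIBCUDF_USE_HOST_COMPRESSION environment variable.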
+ * + * @param compression Type of compression of the input data + * @param inputs Device memory buffers to compress + * @param outputs Device memory buffers to store the compressed output + * @param results Compression results + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream); + +/** + * @brief Aggregate results of compression into a single statistics object. + * + * @param inputs List of uncompressed input buffers + * @param results List of compression results + * @param stream CUDA stream to use + * @return writer_compression_statistics + */ +[[nodiscard]] writer_compression_statistics collect_compression_statistics( + device_span const> inputs, + device_span results, + rmm::cuda_stream_view stream); + } // namespace io::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp index 4b09bd5a84c..0a35b230242 100644 --- a/cpp/src/io/comp/gpuinflate.hpp +++ b/cpp/src/io/comp/gpuinflate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -124,17 +124,4 @@ void gpu_snap(device_span const> inputs, device_span results, rmm::cuda_stream_view stream); -/** - * @brief Aggregate results of compression into a single statistics object. - * - * @param inputs List of uncompressed input buffers - * @param results List of compression results - * @param stream CUDA stream to use - * @return writer_compression_statistics - */ -[[nodiscard]] writer_compression_statistics collect_compression_statistics( - device_span const> inputs, - device_span results, - rmm::cuda_stream_view stream); - } // namespace cudf::io::detail diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 88423122e16..d63fa9f5c35 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -766,6 +766,7 @@ void parquet_writer_options_base::set_stats_level(statistics_freq sf) { _stats_l void parquet_writer_options_base::set_compression(compression_type compression) { _compression = compression; + if (compression == compression_type::AUTO) { _compression = compression_type::SNAPPY; } } void parquet_writer_options_base::enable_int96_timestamps(bool req) diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index f4e75f78dec..8b30cee6681 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -407,7 +407,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, std::optional CompressOrcDataStreams( device_span compressed_data, uint32_t num_compressed_blocks, - CompressionKind compression, + compression_type compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, uint32_t comp_block_align, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 79ecca0ca99..4f296bb5bfc 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include "io/comp/gpuinflate.hpp" -#include "io/comp/nvcomp_adapter.hpp" #include "io/utilities/block_utils.cuh" #include "io/utilities/time_utils.cuh" #include "orc_gpu.hpp" @@ -45,8 +44,6 @@ namespace io { namespace orc { namespace gpu { -namespace nvcomp = cudf::io::detail::nvcomp; - using cudf::detail::device_2dspan; using cudf::io::detail::compression_result; using cudf::io::detail::compression_status; @@ -1362,7 +1359,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, std::optional CompressOrcDataStreams( device_span compressed_data, uint32_t num_compressed_blocks, - CompressionKind compression, + compression_type compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, uint32_t comp_block_align, @@ -1387,47 +1384,7 @@ std::optional CompressOrcDataStreams( max_comp_blk_size, comp_block_align); - if (compression == SNAPPY) { - try { - if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { - cudf::io::detail::gpu_snap(comp_in, comp_out, comp_res, stream); - } else { - nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } - } catch (...) 
{ - // There was an error in compressing so set an error status for each block - thrust::for_each( - rmm::exec_policy(stream), - comp_res.begin(), - comp_res.end(), - [] __device__(compression_result & stat) { stat.status = compression_status::FAILURE; }); - // Since SNAPPY is the default compression (may not be explicitly requested), fall back to - // writing without compression - CUDF_LOG_WARN("ORC writer: compression failed, writing uncompressed data"); - } - } else if (compression == ZLIB) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress( - nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_res, stream); - } else if (compression == ZSTD) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); - } else if (compression == LZ4) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::LZ4); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::LZ4, comp_in, comp_out, comp_res, stream); - } else if (compression != NONE) { - CUDF_FAIL("Unsupported compression type"); - } + cudf::io::detail::compress(compression, comp_in, comp_out, comp_res, stream); dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ce868b83c04..aa0b509981a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ * @brief cuDF-IO ORC writer class implementation */ -#include "io/comp/nvcomp_adapter.hpp" #include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" @@ -71,8 +70,6 @@ namespace cudf::io::orc::detail { -namespace nvcomp = cudf::io::detail::nvcomp; - template [[nodiscard]] constexpr int varint_size(T val) { @@ -92,21 +89,8 @@ struct row_group_index_info { }; namespace { - /** - * @brief Translates ORC compression to nvCOMP compression - */ -auto to_nvcomp_compression_type(CompressionKind compression_kind) -{ - if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY; - if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE; - if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD; - if (compression_kind == LZ4) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); -} - -/** - * @brief Translates cuDF compression to ORC compression + * @brief Translates cuDF compression to ORC compression. */ orc::CompressionKind to_orc_compression(compression_type compression) { @@ -122,19 +106,14 @@ orc::CompressionKind to_orc_compression(compression_type compression) } /** - * @brief Returns the block size for a given compression kind. + * @brief Returns the block size for a given compression format. 
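+ * The result is the smaller of 256 KiB and the compressor's maximum allowed chunk size.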
*/ -constexpr size_t compression_block_size(orc::CompressionKind compression) +size_t compression_block_size(compression_type compression) { - if (compression == orc::CompressionKind::NONE) { return 0; } - - auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) - ? std::nullopt - : nvcomp::compress_max_allowed_chunk_size(ncomp_type); + auto const comp_limit = compress_max_allowed_chunk_size(compression); constexpr size_t max_block_size = 256 * 1024; - return std::min(nvcomp_limit.value_or(max_block_size), max_block_size); + return std::min(comp_limit.value_or(max_block_size), max_block_size); } /** @@ -534,26 +513,6 @@ size_t RLE_stream_size(TypeKind kind, size_t count) } } -auto uncomp_block_alignment(CompressionKind compression_kind) -{ - if (compression_kind == NONE or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { - return 1ul; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind)); -} - -auto comp_block_alignment(CompressionKind compression_kind) -{ - if (compression_kind == NONE or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { - return 1ul; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind)); -} - /** * @brief Builds up per-column streams. * @@ -566,7 +525,7 @@ orc_streams create_streams(host_span columns, file_segmentation const& segmentation, std::map const& decimal_column_sizes, bool enable_dictionary, - CompressionKind compression_kind, + compression_type compression, single_write_mode write_mode) { // 'column 0' row index stream @@ -610,7 +569,7 @@ orc_streams create_streams(host_span columns, auto add_stream = [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { - auto const max_alignment_padding = uncomp_block_alignment(compression_kind) - 1; + auto const max_alignment_padding = compress_required_chunk_alignment(compression) - 1; const auto base = column.index() * gpu::CI_NUM_STREAMS; ids[base + index_type] = streams.size(); streams.push_back(orc::Stream{ @@ -1473,7 +1432,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, * @param[in] rg_stats row group level statistics * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams - * @param[in] compression_kind The compression kind + * @param[in] compression The compression format * @param[in] compression_blocksize The block size used for compression * @param[in] out_sink Sink for writing data */ @@ -1487,7 +1446,7 @@ void write_index_stream(int32_t stripe_id, host_span rg_stats, StripeInformation* stripe, orc_streams* streams, - CompressionKind compression_kind, + compression_type compression, size_t compression_blocksize, std::unique_ptr const& out_sink) { @@ -1501,7 +1460,7 @@ void write_index_stream(int32_t stripe_id, row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { auto const& ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; record.blk_pos = ss.first_block; record.comp_pos = 0; @@ -1541,7 +1500,7 @@ void write_index_stream(int32_t stripe_id, } } - ProtobufWriter pbw((compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((compression != compression_type::NONE) ? 
3 : 0); // Add row index entries auto const& rowgroups_range = segmentation.stripes[stripe_id]; @@ -1566,7 +1525,7 @@ void write_index_stream(int32_t stripe_id, }); (*streams)[stream_id].length = pbw.size(); - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { uint32_t uncomp_ix_len = (uint32_t)((*streams)[stream_id].length - 3) * 2 + 1; pbw.buffer()[0] = static_cast(uncomp_ix_len >> 0); pbw.buffer()[1] = static_cast(uncomp_ix_len >> 8); @@ -1585,7 +1544,7 @@ void write_index_stream(int32_t stripe_id, * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams - * @param[in] compression_kind The compression kind + * @param[in] compression The compression format * @param[in] out_sink Sink for writing data * @param[in] stream CUDA stream used for device memory operations and kernel launches * @return An std::future that should be synchronized to ensure the writing is complete @@ -1596,7 +1555,7 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, host_span bounce_buffer, StripeInformation* stripe, orc_streams* streams, - CompressionKind compression_kind, + compression_type compression, std::unique_ptr const& out_sink, rmm::cuda_stream_view stream) { @@ -1606,8 +1565,9 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, return std::async(std::launch::deferred, [] {}); } - auto const* stream_in = (compression_kind == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] - : (compressed_data + strm_desc.bfr_offset); + auto const* stream_in = (compression == compression_type::NONE) + ? enc_stream.data_ptrs[strm_desc.stream_type] + : (compressed_data + strm_desc.bfr_offset); auto write_task = [&]() { if (out_sink->is_device_write_preferred(length)) { @@ -1627,15 +1587,15 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, /** * @brief Insert 3-byte uncompressed block headers in a byte vector * - * @param compression_kind The compression kind + * @param compression The compression kind * @param compression_blocksize The block size used for compression * @param v The destitation byte vector to write, which must include initial 3-byte header */ -void add_uncompressed_block_headers(CompressionKind compression_kind, +void add_uncompressed_block_headers(compression_type compression, size_t compression_blocksize, std::vector& v) { - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { size_t uncomp_len = v.size() - 3, pos = 0, block_len; while (uncomp_len > compression_blocksize) { block_len = compression_blocksize * 2 + 1; @@ -2021,14 +1981,6 @@ std::map decimal_column_sizes( return column_sizes; } -size_t max_compression_output_size(CompressionKind compression_kind, uint32_t compression_blocksize) -{ - if (compression_kind == NONE) return 0; - - return nvcomp::compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), - compression_blocksize); -} - std::unique_ptr make_table_meta(table_view const& input) { auto table_meta = std::make_unique(input); @@ -2287,7 +2239,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, * @param row_index_stride The row index stride * @param enable_dictionary Whether dictionary is enabled * @param sort_dictionaries Whether to sort the dictionaries - * @param compression_kind The compression kind + * @param compression The compression format * @param compression_blocksize The block size used for compression * @param 
stats_freq Column statistics granularity type for parquet/orc writers * @param collect_compression_stats Flag to indicate if compression statistics should be collected @@ -2302,7 +2254,7 @@ auto convert_table_to_orc_data(table_view const& input, size_type row_index_stride, bool enable_dictionary, bool sort_dictionaries, - CompressionKind compression_kind, + compression_type compression, size_t compression_blocksize, statistics_freq stats_freq, bool collect_compression_stats, @@ -2329,17 +2281,16 @@ auto convert_table_to_orc_data(table_view const& input, auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); - auto const uncompressed_block_align = uncomp_block_alignment(compression_kind); - auto const compressed_block_align = comp_block_alignment(compression_kind); + auto const block_align = compress_required_chunk_alignment(compression); auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes), enable_dictionary, - compression_kind, + compression, write_mode); auto enc_data = encode_columns( - orc_table, std::move(dec_chunk_sizes), segmentation, streams, uncompressed_block_align, stream); + orc_table, std::move(dec_chunk_sizes), segmentation, streams, block_align, stream); stripe_dicts.on_encode_complete(stream); @@ -2371,16 +2322,15 @@ auto convert_table_to_orc_data(table_view const& input, size_t compressed_bfr_size = 0; size_t num_compressed_blocks = 0; - auto const max_compressed_block_size = - max_compression_output_size(compression_kind, compression_blocksize); + auto const max_compressed_block_size = max_compressed_size(compression, compression_blocksize); auto const padded_max_compressed_block_size = - util::round_up_unsafe(max_compressed_block_size, compressed_block_align); + util::round_up_unsafe(max_compressed_block_size, block_align); auto const padded_block_header_size = - util::round_up_unsafe(block_header_size, compressed_block_align); + util::round_up_unsafe(block_header_size, block_align); for (auto& ss : strm_descs.host_view().flat_view()) { size_t stream_size = ss.stream_size; - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { ss.first_block = num_compressed_blocks; ss.bfr_offset = compressed_bfr_size; @@ -2401,14 +2351,14 @@ auto convert_table_to_orc_data(table_view const& input, comp_results.d_begin(), comp_results.d_end(), compression_result{0, compression_status::FAILURE}); - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { strm_descs.host_to_device_async(stream); compression_stats = gpu::CompressOrcDataStreams(compressed_data, num_compressed_blocks, - compression_kind, + compression, compression_blocksize, max_compressed_block_size, - compressed_block_align, + block_align, collect_compression_stats, strm_descs, enc_data.streams, @@ -2459,8 +2409,8 @@ writer::impl::impl(std::unique_ptr sink, : _stream(stream), _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, _row_index_stride{options.get_row_index_stride()}, - _compression_kind(to_orc_compression(options.get_compression())), - _compression_blocksize(compression_block_size(_compression_kind)), + _compression{options.get_compression()}, + _compression_blocksize(compression_block_size(_compression)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), _sort_dictionaries{options.get_enable_dictionary_sort()}, 
@@ -2480,8 +2430,8 @@ writer::impl::impl(std::unique_ptr sink, : _stream(stream), _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, _row_index_stride{options.get_row_index_stride()}, - _compression_kind(to_orc_compression(options.get_compression())), - _compression_blocksize(compression_block_size(_compression_kind)), + _compression{options.get_compression()}, + _compression_blocksize(compression_block_size(_compression)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), _sort_dictionaries{options.get_enable_dictionary_sort()}, @@ -2526,7 +2476,7 @@ void writer::impl::write(table_view const& input) _row_index_stride, _enable_dictionary, _sort_dictionaries, - _compression_kind, + _compression, _compression_blocksize, _stats_freq, _compression_statistics != nullptr, @@ -2613,7 +2563,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, rg_stats, &stripe, &streams, - _compression_kind, + _compression, _compression_blocksize, _out_sink); } @@ -2627,7 +2577,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, bounce_buffer, &stripe, &streams, - _compression_kind, + _compression, _out_sink, _stream)); } @@ -2645,10 +2595,10 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, : 0; if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } } - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(sf); stripe.footerLength = pbw.size(); - if (_compression_kind != NONE) { + if (_compression != compression_type::NONE) { uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; pbw.buffer()[0] = static_cast(uncomp_sf_len >> 0); pbw.buffer()[1] = static_cast(uncomp_sf_len >> 8); @@ -2780,21 +2730,21 @@ void writer::impl::close() // Write statistics metadata if (not _orc_meta.stripeStats.empty()) { - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(_orc_meta); - add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer()); + add_uncompressed_block_headers(_compression, _compression_blocksize, pbw.buffer()); ps.metadataLength = pbw.size(); _out_sink->host_write(pbw.data(), pbw.size()); } else { ps.metadataLength = 0; } - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(_footer); - add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer()); + add_uncompressed_block_headers(_compression, _compression_blocksize, pbw.buffer()); // Write postscript metadata ps.footerLength = pbw.size(); - ps.compression = _compression_kind; + ps.compression = to_orc_compression(_compression); ps.compressionBlockSize = _compression_blocksize; ps.version = {0, 12}; // Hive 0.12 ps.writerVersion = cudf_writer_version; diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index cae849ee315..7d23482cb17 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -342,7 +342,7 @@ class writer::impl { // Writer options. 
stripe_size_limits const _max_stripe_size; size_type const _row_index_stride; - CompressionKind const _compression_kind; + compression_type const _compression; size_t const _compression_blocksize; std::shared_ptr _compression_statistics; // Optional output statistics_freq const _stats_freq; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 77924ac0f35..1b67b53ae8e 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -23,8 +23,7 @@ #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "interop/decimal_conversion_utilities.cuh" -#include "io/comp/gpuinflate.hpp" -#include "io/comp/nvcomp_adapter.hpp" +#include "io/comp/comp.hpp" #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" @@ -67,6 +66,20 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; +Compression to_parquet_compression(compression_type compression) +{ + switch (compression) { + case compression_type::AUTO: + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::LZ4: + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + return Compression::LZ4_RAW; + case compression_type::NONE: return Compression::UNCOMPRESSED; + default: CUDF_FAIL("Unsupported compression type"); + } +} + struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, host_span const> kv_md, @@ -1172,7 +1185,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, size_t max_page_size_bytes, size_type max_page_size_rows, bool write_v2_headers, - Compression compression_codec, + compression_type compression, rmm::cuda_stream_view stream) { if (chunks.is_empty()) { return cudf::detail::hostdevice_vector{}; } @@ -1187,7 +1200,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1212,7 +1225,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1221,12 +1234,10 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Get per-page max compressed size cudf::detail::hostdevice_vector comp_page_sizes(num_pages, stream); - std::transform(page_sizes.begin(), - page_sizes.end(), - comp_page_sizes.begin(), - [compression_codec](auto page_size) { - return max_compression_output_size(compression_codec, page_size); - }); + std::transform( + page_sizes.begin(), page_sizes.end(), comp_page_sizes.begin(), [compression](auto page_size) { + return max_compressed_size(compression, page_size); + }); comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size @@ -1238,7 +1249,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1247,16 +1258,13 @@ auto init_page_sizes(hostdevice_2dvector& chunks, return comp_page_sizes; } -size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) +size_t max_page_bytes(compression_type compression, size_t 
max_page_size_bytes) { - if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == compression_type::NONE) { return max_page_size_bytes; } - auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) - ? std::nullopt - : nvcomp::compress_max_allowed_chunk_size(ncomp_type); + auto const comp_limit = compress_max_allowed_chunk_size(compression); - auto max_size = std::min(nvcomp_limit.value_or(max_page_size_bytes), max_page_size_bytes); + auto max_size = std::min(comp_limit.value_or(max_page_size_bytes), max_page_size_bytes); // page size must fit in a 32-bit signed integer return std::min(max_size, std::numeric_limits::max()); } @@ -1265,7 +1273,7 @@ std::pair>, std::vector& chunks, host_span col_desc, device_2dspan frags, - Compression compression, + compression_type compression, dictionary_policy dict_policy, size_t max_dict_size, rmm::cuda_stream_view stream) @@ -1404,7 +1412,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param num_columns Total number of columns * @param num_pages Total number of pages * @param num_stats_bfr Number of statistics buffers - * @param compression Compression format + * @param alignment Page alignment * @param max_page_size_bytes Maximum uncompressed page size, in bytes * @param max_page_size_rows Maximum page size, in rows * @param write_v2_headers True if version 2 page headers are to be written @@ -1419,7 +1427,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, uint32_t num_columns, uint32_t num_pages, uint32_t num_stats_bfr, - Compression compression, + size_t alignment, size_t max_page_size_bytes, size_type max_page_size_rows, bool write_v2_headers, @@ -1435,7 +1443,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression), + alignment, write_v2_headers, (num_stats_bfr) ? page_stats_mrg.data() : nullptr, (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, @@ -1478,7 +1486,7 @@ void encode_pages(hostdevice_2dvector& chunks, statistics_chunk const* chunk_stats, statistics_chunk const* column_stats, std::optional& comp_stats, - Compression compression, + compression_type compression, int32_t column_index_truncate_length, bool write_v2_headers, rmm::cuda_stream_view stream) @@ -1488,7 +1496,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats, num_pages) : device_span(); - uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? num_pages : 0; + uint32_t max_comp_pages = (compression != compression_type::NONE) ? 
num_pages : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1499,34 +1507,7 @@ void encode_pages(hostdevice_2dvector& chunks, compression_result{0, compression_status::FAILURE}); EncodePages(pages, write_v2_headers, comp_in, comp_out, comp_res, stream); - switch (compression) { - case Compression::SNAPPY: - if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { - gpu_snap(comp_in, comp_out, comp_res, stream); - } else { - nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } - break; - case Compression::ZSTD: { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); - break; - } - case Compression::LZ4_RAW: { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::LZ4); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::LZ4, comp_in, comp_out, comp_res, stream); - break; - } - case Compression::UNCOMPRESSED: break; - default: CUDF_FAIL("invalid compression type"); - } + compress(compression, comp_in, comp_out, comp_res, stream); // TBD: Not clear if the official spec actually allows dynamically turning off compression at the // chunk-level @@ -1744,7 +1725,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type max_page_size_rows, int32_t column_index_truncate_length, statistics_freq stats_granularity, - Compression compression, + compression_type compression, bool collect_compression_statistics, dictionary_policy dict_policy, size_t max_dictionary_size, @@ -2146,7 +2127,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == compression_type::NONE) { max_comp_bfr_size = 0; } // Initialize data pointers uint32_t const num_stats_bfr = @@ -2214,7 +2195,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, num_columns, num_pages, num_stats_bfr, - compression, + compress_required_chunk_alignment(compression), max_page_size_bytes, max_page_size_rows, write_v2_headers, @@ -2270,7 +2251,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const dev_bfr = ck.is_compressed ? 
ck.compressed_bfr : ck.uncompressed_bfr; auto& column_chunk_meta = row_group.columns[i].meta_data; - if (ck.is_compressed) { column_chunk_meta.codec = compression; } + if (ck.is_compressed) { column_chunk_meta.codec = to_parquet_compression(compression); } if (!out_sink[p]->is_device_write_preferred(ck.compressed_size)) { all_device_write = false; } @@ -2375,7 +2356,7 @@ writer::impl::impl(std::vector> sinks, single_write_mode mode, rmm::cuda_stream_view stream) : _stream(stream), - _compression(to_parquet_compression(options.get_compression())), + _compression(options.get_compression()), _max_row_group_size{options.get_row_group_size_bytes()}, _max_row_group_rows{options.get_row_group_size_rows()}, _max_page_size_bytes(max_page_bytes(_compression, options.get_max_page_size_bytes())), @@ -2406,7 +2387,7 @@ writer::impl::impl(std::vector> sinks, single_write_mode mode, rmm::cuda_stream_view stream) : _stream(stream), - _compression(to_parquet_compression(options.get_compression())), + _compression(options.get_compression()), _max_row_group_size{options.get_row_group_size_bytes()}, _max_row_group_rows{options.get_row_group_size_rows()}, _max_page_size_bytes(max_page_bytes(_compression, options.get_max_page_size_bytes())), diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 63128faf993..d5a5a534b93 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,7 +144,7 @@ class writer::impl { rmm::cuda_stream_view _stream; // Writer options. - Compression const _compression; + compression_type const _compression; size_t const _max_row_group_size; size_type const _max_row_group_rows; size_t const _max_page_size_bytes; diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index f15ea1f3c37..ede788c97c2 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,8 +21,6 @@ #include "writer_impl_helpers.hpp" -#include "io/comp/nvcomp_adapter.hpp" - #include #include #include @@ -32,48 +30,6 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; -Compression to_parquet_compression(compression_type compression) -{ - switch (compression) { - case compression_type::AUTO: - case compression_type::SNAPPY: return Compression::SNAPPY; - case compression_type::ZSTD: return Compression::ZSTD; - case compression_type::LZ4: - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - return Compression::LZ4_RAW; - case compression_type::NONE: return Compression::UNCOMPRESSED; - default: CUDF_FAIL("Unsupported compression type"); - } -} - -nvcomp::compression_type to_nvcomp_compression_type(Compression codec) -{ - switch (codec) { - case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY; - case Compression::ZSTD: return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4; - default: CUDF_FAIL("Unsupported compression type"); - } -} - -uint32_t page_alignment(Compression codec) -{ - if (codec == Compression::UNCOMPRESSED or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { - return 1u; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(codec)); -} - -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) -{ - if (codec == Compression::UNCOMPRESSED) return 0; - - return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); -} - void fill_table_meta(table_input_metadata& table_meta) { // Fill unnamed columns' names in table_meta diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 14a9a0ed5b7..b5c73c348fe 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,48 +20,12 @@ */ #pragma once -#include "parquet_common.hpp" #include #include -#include namespace cudf::io::parquet::detail { -/** - * @brief Function that translates GDF compression to parquet compression. - * - * @param compression The compression type - * @return The supported Parquet compression - */ -Compression to_parquet_compression(compression_type compression); - -/** - * @brief Function that translates the given compression codec to nvcomp compression type. - * - * @param codec Compression codec - * @return Translated nvcomp compression type - */ -cudf::io::detail::nvcomp::compression_type to_nvcomp_compression_type(Compression codec); - -/** - * @brief Function that computes input alignment requirements for the given compression type. - * - * @param codec Compression codec - * @return Required alignment - */ -uint32_t page_alignment(Compression codec); - -/** - * @brief Gets the maximum compressed chunk size for the largest chunk uncompressed chunk in the - * batch. 
- * - * @param codec Compression codec - * @param compression_blocksize Size of the largest uncompressed chunk in the batch - * @return Maximum compressed chunk size - */ -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); - /** * @brief Fill the table metadata with default column names. * diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 2209a30149d..708c2045a74 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -2068,6 +2068,7 @@ TEST_P(OrcCompressionTest, Basic) INSTANTIATE_TEST_CASE_P(OrcCompressionTest, OrcCompressionTest, ::testing::Values(cudf::io::compression_type::NONE, + cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, cudf::io::compression_type::ZSTD)); diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index d66f685cd9c..419ac909ac6 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -268,6 +268,7 @@ TEST_P(ParquetCompressionTest, Basic) INSTANTIATE_TEST_CASE_P(ParquetCompressionTest, ParquetCompressionTest, ::testing::Values(cudf::io::compression_type::NONE, + cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, cudf::io::compression_type::ZSTD)); From 478ec50edf302a338db043039abad6a2560144ea Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 13 Jan 2025 15:19:44 -0600 Subject: [PATCH 09/12] Precompute AST arity (#17234) This PR precomputes AST arity on the host, to reduce the complexity in device-side arity lookup. Authors: - Bradley Dice (https://github.com/bdice) - Basit Ayantunde (https://github.com/lamarrr) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Kyle Edwards (https://github.com/KyleFromNVIDIA) URL: https://github.com/rapidsai/cudf/pull/17234 --- cpp/CMakeLists.txt | 1 + .../cudf/ast/detail/expression_evaluator.cuh | 4 +- .../cudf/ast/detail/expression_parser.hpp | 50 ++- cpp/include/cudf/ast/detail/operators.hpp | 418 +++--------------- cpp/src/ast/expression_parser.cpp | 3 +- cpp/src/ast/operators.cpp | 293 ++++++++++++ 6 files changed, 391 insertions(+), 378 deletions(-) create mode 100644 cpp/src/ast/operators.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 252cc7897d8..4d83cbd907c 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -336,6 +336,7 @@ add_library( src/aggregation/result_cache.cpp src/ast/expression_parser.cpp src/ast/expressions.cpp + src/ast/operators.cpp src/binaryop/binaryop.cpp src/binaryop/compiled/ATan2.cu src/binaryop/compiled/Add.cu diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index 9d8762555d7..001b604814c 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -452,7 +452,7 @@ struct expression_evaluator { ++operator_index) { // Execute operator auto const op = plan.operators[operator_index]; - auto const arity = ast_operator_arity(op); + auto const arity = plan.operator_arities[operator_index]; if (arity == 1) { // Unary operator auto const& input = diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index b5973d0ace9..d2e8c1cd41f 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -88,6 +89,7 @@ struct expression_device_view { device_span data_references; device_span literals; device_span operators; + device_span operator_arities; device_span operator_source_indices; cudf::size_type num_intermediates; }; @@ -229,39 +231,55 @@ class expression_parser { * @param[in] v The `std::vector` containing components (operators, literals, etc). * @param[in,out] sizes The `std::vector` containing the size of each data buffer. * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. + * @param[in,out] alignment The maximum alignment needed for all the extracted size and pointers */ template void extract_size_and_pointer(std::vector const& v, std::vector& sizes, - std::vector& data_pointers) + std::vector& data_pointers, + cudf::size_type& alignment) { + // sub-type alignment will only work provided the alignment is lesser or equal to + // alignof(max_align_t) which is the maximum alignment provided by rmm's device buffers + static_assert(alignof(T) <= alignof(max_align_t)); auto const data_size = sizeof(T) * v.size(); sizes.push_back(data_size); data_pointers.push_back(v.data()); + alignment = std::max(alignment, static_cast(alignof(T))); } void move_to_device(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { std::vector sizes; std::vector data_pointers; + // use a minimum of 4-byte alignment + cudf::size_type buffer_alignment = 4; - extract_size_and_pointer(_data_references, sizes, data_pointers); - extract_size_and_pointer(_literals, sizes, data_pointers); - extract_size_and_pointer(_operators, sizes, data_pointers); - extract_size_and_pointer(_operator_source_indices, sizes, data_pointers); + extract_size_and_pointer(_data_references, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_literals, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operators, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operator_arities, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operator_source_indices, sizes, data_pointers, buffer_alignment); // Create device buffer - auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); - auto buffer_offsets = std::vector(sizes.size()); - thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); + auto buffer_offsets = std::vector(sizes.size()); + thrust::exclusive_scan(sizes.cbegin(), + sizes.cend(), + buffer_offsets.begin(), + cudf::size_type{0}, + [buffer_alignment](auto a, 
auto b) { + // align each component of the AST program + return cudf::util::round_up_safe(a + b, buffer_alignment); + }); + + auto const buffer_size = buffer_offsets.empty() ? 0 : (buffer_offsets.back() + sizes.back()); + auto host_data_buffer = std::vector(buffer_size); - auto h_data_buffer = std::vector(buffer_size); for (unsigned int i = 0; i < data_pointers.size(); ++i) { - std::memcpy(h_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); + std::memcpy(host_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); } - _device_data_buffer = rmm::device_buffer(h_data_buffer.data(), buffer_size, stream, mr); - + _device_data_buffer = rmm::device_buffer(host_data_buffer.data(), buffer_size, stream, mr); stream.synchronize(); // Create device pointers to components of plan @@ -277,8 +295,11 @@ class expression_parser { device_expression_data.operators = device_span( reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), _operators.size()); - device_expression_data.operator_source_indices = device_span( + device_expression_data.operator_arities = device_span( reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + _operators.size()); + device_expression_data.operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[4]), _operator_source_indices.size()); device_expression_data.num_intermediates = _intermediate_counter.get_max_used(); shmem_per_thread = static_cast( @@ -322,6 +343,7 @@ class expression_parser { bool _has_nulls; std::vector _data_references; std::vector _operators; + std::vector _operator_arities; std::vector _operator_source_indices; std::vector _literals; }; diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 46507700e21..db04e1fe989 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,159 +69,111 @@ constexpr bool is_valid_unary_op = cuda::std::is_invocable_v; * @param args Forwarded arguments to `operator()` of `f`. */ template -CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, F&& f, Ts&&... args) +CUDF_HOST_DEVICE inline constexpr decltype(auto) ast_operator_dispatcher(ast_operator op, + F&& f, + Ts&&... 
args) { switch (op) { case ast_operator::ADD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SUB: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::MUL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TRUE_DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::FLOOR_DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::MOD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::PYMOD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::POW: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NOT_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LESS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::GREATER: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LESS_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::GREATER_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_XOR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOGICAL_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_LOGICAL_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOGICAL_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_LOGICAL_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::IDENTITY: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::IS_NULL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case 
ast_operator::SIN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::COS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TAN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCSIN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCCOS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCTAN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SINH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::COSH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TANH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCSINH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCCOSH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCTANH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::EXP: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOG: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SQRT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CBRT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CEIL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::FLOOR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ABS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::RINT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BIT_INVERT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NOT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_INT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_UINT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_FLOAT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid operator."); @@ -955,231 +907,6 @@ 
struct operator_functor { } }; -/** - * @brief Functor used to single-type-dispatch binary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_binary_op` trait. This function assumes that both inputs are - * the same type, and dispatches based on the type of the left input. - * - * @tparam OperatorFunctor Binary operator functor. - */ -template -struct single_dispatch_binary_operator_types { - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation."); -#else - CUDF_UNREACHABLE("Invalid binary operation."); -#endif - } -}; - -/** - * @brief Functor performing a type dispatch for a binary operator. - * - * This functor performs single dispatch, which assumes lhs_type == rhs_type. This may not be true - * for all binary operators but holds for all currently implemented operators. - */ -struct type_dispatch_binary_op { - /** - * @brief Performs type dispatch for a binary operator. - * - * @tparam op AST operator. - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param lhs_type Type of left input data. - * @param rhs_type Type of right input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ - template - CUDF_HOST_DEVICE inline void operator()(cudf::data_type lhs_type, - cudf::data_type rhs_type, - F&& f, - Ts&&... args) - { - // Single dispatch (assume lhs_type == rhs_type) - type_dispatcher( - lhs_type, - // Always dispatch to the non-null operator for the purpose of type determination. - detail::single_dispatch_binary_operator_types>{}, - std::forward(f), - std::forward(args)...); - } -}; - -/** - * @brief Dispatches a runtime binary operator to a templated type dispatcher. - * - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param lhs_type Type of left input data. - * @param rhs_type Type of right input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ -template -CUDF_HOST_DEVICE inline constexpr void binary_operator_dispatcher( - ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) -{ - ast_operator_dispatcher(op, - detail::type_dispatch_binary_op{}, - lhs_type, - rhs_type, - std::forward(f), - std::forward(args)...); -} - -/** - * @brief Functor used to type-dispatch unary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_unary_op` trait. - * - * @tparam OperatorFunctor Unary operator functor. - */ -template -struct dispatch_unary_operator_types { - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid unary operation."); -#else - CUDF_UNREACHABLE("Invalid unary operation."); -#endif - } -}; - -/** - * @brief Functor performing a type dispatch for a unary operator. 
- */ -struct type_dispatch_unary_op { - template - CUDF_HOST_DEVICE inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args) - { - type_dispatcher( - input_type, - // Always dispatch to the non-null operator for the purpose of type determination. - detail::dispatch_unary_operator_types>{}, - std::forward(f), - std::forward(args)...); - } -}; - -/** - * @brief Dispatches a runtime unary operator to a templated type dispatcher. - * - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param input_type Type of input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ -template -CUDF_HOST_DEVICE inline constexpr void unary_operator_dispatcher(ast_operator op, - cudf::data_type input_type, - F&& f, - Ts&&... args) -{ - ast_operator_dispatcher(op, - detail::type_dispatch_unary_op{}, - input_type, - std::forward(f), - std::forward(args)...); -} - -/** - * @brief Functor to determine the return type of an operator from its input types. - */ -struct return_type_functor { - /** - * @brief Callable for binary operators to determine return type. - * - * @tparam OperatorFunctor Operator functor to perform. - * @tparam LHS Left input type. - * @tparam RHS Right input type. - * @param result Reference whose value is assigned to the result data type. - */ - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { - using Out = cuda::std::invoke_result_t; - result = cudf::data_type(cudf::type_to_id()); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation. Return type cannot be determined."); -#else - CUDF_UNREACHABLE("Invalid binary operation. Return type cannot be determined."); -#endif - } - - /** - * @brief Callable for unary operators to determine return type. - * - * @tparam OperatorFunctor Operator functor to perform. - * @tparam T Input type. - * @param result Pointer whose value is assigned to the result data type. - */ - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { - using Out = cuda::std::invoke_result_t; - result = cudf::data_type(cudf::type_to_id()); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); -#else - CUDF_UNREACHABLE("Invalid unary operation. Return type cannot be determined."); -#endif - } -}; - /** * @brief Gets the return type of an AST operator. * @@ -1187,34 +914,8 @@ struct return_type_functor { * @param operand_types Vector of input types to the operator. * @return cudf::data_type Return type of the operator. */ -inline cudf::data_type ast_operator_return_type(ast_operator op, - std::vector const& operand_types) -{ - auto result = cudf::data_type(cudf::type_id::EMPTY); - switch (operand_types.size()) { - case 1: - unary_operator_dispatcher(op, operand_types[0], detail::return_type_functor{}, result); - break; - case 2: - binary_operator_dispatcher( - op, operand_types[0], operand_types[1], detail::return_type_functor{}, result); - break; - default: CUDF_FAIL("Unsupported operator return type."); break; - } - return result; -} - -/** - * @brief Functor to determine the arity (number of operands) of an operator. 
- */ -struct arity_functor { - template - CUDF_HOST_DEVICE inline void operator()(cudf::size_type& result) - { - // Arity is not dependent on null handling, so just use the false implementation here. - result = operator_functor::arity; - } -}; +cudf::data_type ast_operator_return_type(ast_operator op, + std::vector const& operand_types); /** * @brief Gets the arity (number of operands) of an AST operator. @@ -1222,12 +923,7 @@ struct arity_functor { * @param op Operator used to determine arity. * @return Arity of the operator. */ -CUDF_HOST_DEVICE inline cudf::size_type ast_operator_arity(ast_operator op) -{ - auto result = cudf::size_type(0); - ast_operator_dispatcher(op, detail::arity_functor{}, result); - return result; -} +cudf::size_type ast_operator_arity(ast_operator op); } // namespace detail diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp index d0e4c59ca54..b2cc134d9fa 100644 --- a/cpp/src/ast/expression_parser.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -161,6 +161,7 @@ cudf::size_type expression_parser::visit(operation const& expr) auto const op = expr.get_operator(); auto const data_type = cudf::ast::detail::ast_operator_return_type(op, operand_types); _operators.push_back(op); + _operator_arities.push_back(cudf::ast::detail::ast_operator_arity(op)); // Push data reference auto const output = [&]() { if (expression_index == 0) { diff --git a/cpp/src/ast/operators.cpp b/cpp/src/ast/operators.cpp new file mode 100644 index 00000000000..b60a69a42d9 --- /dev/null +++ b/cpp/src/ast/operators.cpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace ast { +namespace detail { +namespace { + +struct arity_functor { + template + void operator()(cudf::size_type& result) + { + // Arity is not dependent on null handling, so just use the false implementation here. + result = operator_functor::arity; + } +}; + +/** + * @brief Functor to determine the return type of an operator from its input types. + */ +struct return_type_functor { + /** + * @brief Callable for binary operators to determine return type. + * + * @tparam OperatorFunctor Operator functor to perform. + * @tparam LHS Left input type. + * @tparam RHS Right input type. + * @param result Pointer whose value is assigned to the result data type. + */ + template >* = nullptr> + void operator()(cudf::data_type& result) + { + using Out = cuda::std::invoke_result_t; + result = cudf::data_type{cudf::type_to_id()}; + } + + template >* = nullptr> + void operator()(cudf::data_type& result) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid binary operation. 
Return type cannot be determined."); +#else + CUDF_UNREACHABLE("Invalid binary operation. Return type cannot be determined."); +#endif + result = cudf::data_type{cudf::type_id::EMPTY}; + } + + /** + * @brief Callable for unary operators to determine return type. + * + * @tparam OperatorFunctor Operator functor to perform. + * @tparam T Input type. + * @param result Pointer whose value is assigned to the result data type. + */ + template >* = nullptr> + void operator()(cudf::data_type& result) + { + using Out = cuda::std::invoke_result_t; + result = cudf::data_type{cudf::type_to_id()}; + } + + template >* = nullptr> + void operator()(cudf::data_type& result) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); +#else + CUDF_UNREACHABLE("Invalid unary operation. Return type cannot be determined."); +#endif + result = cudf::data_type{cudf::type_id::EMPTY}; + } +}; + +/** + * @brief Functor used to single-type-dispatch binary operators. + * + * This functor's `operator()` is templated to validate calls to its operators based on the input + * type, as determined by the `is_valid_binary_op` trait. This function assumes that both inputs are + * the same type, and dispatches based on the type of the left input. + * + * @tparam OperatorFunctor Binary operator functor. + */ +template +struct single_dispatch_binary_operator_types { + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { + f.template operator()(std::forward(args)...); + } + + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid binary operation."); +#else + CUDF_UNREACHABLE("Invalid binary operation."); +#endif + } +}; + +/** + * @brief Functor performing a type dispatch for a binary operator. + * + * This functor performs single dispatch, which assumes lhs_type == rhs_type. This may not be true + * for all binary operators but holds for all currently implemented operators. + */ +struct type_dispatch_binary_op { + /** + * @brief Performs type dispatch for a binary operator. + * + * @tparam op AST operator. + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param lhs_type Type of left input data. + * @param rhs_type Type of right input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ + template + inline void operator()(cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) + { + // Single dispatch (assume lhs_type == rhs_type) + type_dispatcher( + lhs_type, + // Always dispatch to the non-null operator for the purpose of type determination. + detail::single_dispatch_binary_operator_types>{}, + std::forward(f), + std::forward(args)...); + } +}; + +/** + * @brief Dispatches a runtime binary operator to a templated type dispatcher. + * + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param lhs_type Type of left input data. + * @param rhs_type Type of right input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ +template +inline constexpr void binary_operator_dispatcher( + ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... 
args) +{ + ast_operator_dispatcher(op, + detail::type_dispatch_binary_op{}, + lhs_type, + rhs_type, + std::forward(f), + std::forward(args)...); +} + +/** + * @brief Functor used to type-dispatch unary operators. + * + * This functor's `operator()` is templated to validate calls to its operators based on the input + * type, as determined by the `is_valid_unary_op` trait. + * + * @tparam OperatorFunctor Unary operator functor. + */ +template +struct dispatch_unary_operator_types { + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { + f.template operator()(std::forward(args)...); + } + + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid unary operation."); +#else + CUDF_UNREACHABLE("Invalid unary operation."); +#endif + } +}; + +/** + * @brief Functor performing a type dispatch for a unary operator. + */ +struct type_dispatch_unary_op { + template + inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args) + { + type_dispatcher( + input_type, + // Always dispatch to the non-null operator for the purpose of type determination. + detail::dispatch_unary_operator_types>{}, + std::forward(f), + std::forward(args)...); + } +}; + +/** + * @brief Dispatches a runtime unary operator to a templated type dispatcher. + * + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param input_type Type of input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ +template +inline constexpr void unary_operator_dispatcher(ast_operator op, + cudf::data_type input_type, + F&& f, + Ts&&... args) +{ + ast_operator_dispatcher(op, + detail::type_dispatch_unary_op{}, + input_type, + std::forward(f), + std::forward(args)...); +} + +} // namespace + +cudf::data_type ast_operator_return_type(ast_operator op, + std::vector const& operand_types) +{ + cudf::data_type result{cudf::type_id::EMPTY}; + switch (operand_types.size()) { + case 1: + unary_operator_dispatcher(op, operand_types[0], detail::return_type_functor{}, result); + break; + case 2: + binary_operator_dispatcher( + op, operand_types[0], operand_types[1], detail::return_type_functor{}, result); + break; + default: CUDF_FAIL("Unsupported operator return type."); break; + } + return result; +} + +cudf::size_type ast_operator_arity(ast_operator op) +{ + cudf::size_type result{}; + ast_operator_dispatcher(op, arity_functor{}, result); + return result; +} + +} // namespace detail + +} // namespace ast + +} // namespace cudf From f84cd4316eaa61e231b5fd096608ca09d5e3c08c Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:26:43 -0500 Subject: [PATCH 10/12] [BUG] xfail Polars excel test (#17731) One the Polars tests fails when `fastexcel>=0.12.1`. I opened https://github.com/pola-rs/polars/issues/20698 to track that failing test. This PR xfail that test for now. 
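For context, the gating used here is the standard pytest conditional-xfail marker; a minimal standalone sketch of the same pattern (the decorator name and test body below are illustrative only, not part of this patch):

```python
# Hypothetical, self-contained illustration of a version-gated xfail.
import fastexcel
import pytest
from packaging import version

xfail_new_fastexcel = pytest.mark.xfail(
    condition=version.parse(fastexcel.__version__) >= version.parse("0.12.1"),
    reason="Tracked upstream: https://github.com/pola-rs/polars/issues/20698",
)


@xfail_new_fastexcel
def test_write_excel_bytes():
    # Placeholder body; the real test lives in the Polars test suite.
    ...
```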
xref #17677 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17731 --- python/cudf_polars/cudf_polars/testing/plugin.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index c16df320ceb..e453a8b89b9 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -8,7 +8,9 @@ from functools import partialmethod from typing import TYPE_CHECKING +import fastexcel import pytest +from packaging import version import polars @@ -44,7 +46,7 @@ def pytest_configure(config: pytest.Config) -> None: ) -EXPECTED_FAILURES: Mapping[str, str] = { +EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = { "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning", @@ -192,6 +194,10 @@ def pytest_configure(config: pytest.Config) -> None: # Maybe flaky, order-dependent? "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", + "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": ( + "Fails when fastexcel version >= 0.12.1. tracking issue: https://github.com/pola-rs/polars/issues/20698", + version.parse(fastexcel.__version__) >= version.parse("0.12.1"), + ), } @@ -219,4 +225,12 @@ def pytest_collection_modifyitems( if item.nodeid in TESTS_TO_SKIP: item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid])) elif item.nodeid in EXPECTED_FAILURES: + if isinstance(EXPECTED_FAILURES[item.nodeid], tuple): + # the second entry in the tuple is the condition to xfail on + item.add_marker( + pytest.mark.xfail( + condition=EXPECTED_FAILURES[item.nodeid][1], + reason=EXPECTED_FAILURES[item.nodeid][0], + ), + ) item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) From 253fb2f10e921519502e562672d29029e844c2cf Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:41:47 -0800 Subject: [PATCH 11/12] Require to implement `AutoCloseable` for the classes derived from `HostUDFWrapper` (#17727) This adds the requirement to implement `AutoCloseable` to the classes derived from `HostUDFWrapper`, forcing them to delete the native UDF instance upon class destruction. Doing so will fix the memory leak issue when the native UDF instance never being destroyed. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/17727 --- java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java index 0b6ecf2e140..124f2c99188 100644 --- a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java +++ b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java @@ -24,8 +24,10 @@ *

* A new host UDF aggregation implementation must extend this class and override the * {@code hashCode} and {@code equals} methods for such purposes. + * In addition, since this class implements {@code AutoCloseable}, the {@code close} method must + * also be overridden to automatically delete the native UDF instance upon class destruction. */ -public abstract class HostUDFWrapper { +public abstract class HostUDFWrapper implements AutoCloseable { public final long udfNativeHandle; public HostUDFWrapper(long udfNativeHandle) { From 847029172ce47ef2109dc825f149ab58130a2fc6 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Jan 2025 09:18:13 -0600 Subject: [PATCH 12/12] convert all nulls to nans in a specific scenario (#17677) Fixes: #17666 This PR ensures we convert all nulls to nan's in float columns only in pandas compatibility mode. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17677 --- python/cudf/cudf/core/column/column.py | 4 ++++ python/cudf/cudf/tests/test_series.py | 11 ++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 30da8727366..19f2802553d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2413,7 +2413,11 @@ def as_column( and pa.types.is_integer(arbitrary.type) and arbitrary.null_count > 0 ): + # TODO: Need to re-visit this cast and fill_null + # calls while addressing the following issue: + # https://github.com/rapidsai/cudf/issues/14149 arbitrary = arbitrary.cast(pa.float64()) + arbitrary = pc.fill_null(arbitrary, np.nan) if ( cudf.get_option("default_integer_bitwidth") and pa.types.is_integer(arbitrary.type) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f8697c5c6b8..891c0ede9a4 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import datetime import decimal import hashlib @@ -3003,3 +3003,12 @@ def test_dtype_dtypes_equal(): ser = cudf.Series([0]) assert ser.dtype is ser.dtypes assert ser.dtypes is ser.to_pandas().dtypes + + +def test_null_like_to_nan_pandas_compat(): + with cudf.option_context("mode.pandas_compatible", True): + ser = cudf.Series([1, 2, np.nan, 10, None]) + pser = pd.Series([1, 2, np.nan, 10, None]) + + assert pser.dtype == ser.dtype + assert_eq(ser, pser)
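
As a usage note on this last change, a minimal sketch of the intended behavior (assuming a cudf build that includes this patch; only integer data containing nulls is affected):

```python
# Sketch of the behavior described above; mirrors the new test rather than any new API.
import cudf
import numpy as np

data = [1, 2, None, 10]

# Default mode: nulls stay as nulls and the column keeps its int64 dtype.
s_default = cudf.Series(data)
print(s_default.dtype)  # int64, with a null mask

# pandas compatibility mode: nulls in integer data are promoted to NaN in a
# float64 column, matching what pandas.Series(data) would produce.
with cudf.option_context("mode.pandas_compatible", True):
    s_compat = cudf.Series(data)
print(s_compat.dtype)                # float64
print(s_compat.to_pandas().isna())   # True only at the position of the original null
```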