From 45a73291d4b9aa9f668405549ecce6e5df29eb7d Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Mon, 6 Jan 2025 14:29:59 -0600
Subject: [PATCH 01/26] remove find_package(Python) in libcudf build (#17683)

Nothing in `libcudf`'s CMake should need a Python interpreter or linking
to Python components. This proposes removing the `find_package(Python)`
call there, to simplify that build:
https://github.com/rapidsai/cudf/blob/955b1f4566abccf920a022dc78a1e654acf0de16/python/libcudf/CMakeLists.txt#L37-L38

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17683
---
 .github/CODEOWNERS            | 7 +++----
 python/libcudf/CMakeLists.txt | 5 +----
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 5e2f46714d9..e0b315f34fc 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -8,10 +8,9 @@ notebooks/ @rapidsai/cudf-python-codeowners
 python/dask_cudf/ @rapidsai/cudf-dask-codeowners
 
 #cmake code owners
-cpp/CMakeLists.txt @rapidsai/cudf-cmake-codeowners
-cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners
-**/cmake/ @rapidsai/cudf-cmake-codeowners
-*.cmake @rapidsai/cudf-cmake-codeowners
+CMakeLists.txt @rapidsai/cudf-cmake-codeowners
+**/cmake/ @rapidsai/cudf-cmake-codeowners
+*.cmake @rapidsai/cudf-cmake-codeowners
 
 #java code owners
 java/ @rapidsai/cudf-java-codeowners
diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt
index 5f9a04d3cee..259492b98d1 100644
--- a/python/libcudf/CMakeLists.txt
+++ b/python/libcudf/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. 
You may obtain a copy of the License at @@ -34,9 +34,6 @@ endif() unset(cudf_FOUND) -# Find Python early so that later commands can use it -find_package(Python 3.10 REQUIRED COMPONENTS Interpreter) - set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) From b81d9e17fbffbb912e0128148f556bf7af41b6ab Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 6 Jan 2025 14:25:05 -0800 Subject: [PATCH 02/26] Fix cudf.polars sum of empty not equalling zero (#17685) closes #17681 (We have a similar carve-out in cudf classic due to `sum([]) == 0` in Python) Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17685 --- .../cudf_polars/dsl/expressions/aggregation.py | 14 +++++++++++++- python/cudf_polars/tests/expressions/test_agg.py | 8 +++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index b88b109a975..92f39abe71e 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -91,7 +91,7 @@ def __init__( op = partial(self._reduce, request=req) elif name in {"min", "max"}: op = partial(op, propagate_nans=options) - elif name in {"count", "first", "last"}: + elif name in {"count", "sum", "first", "last"}: pass else: raise NotImplementedError( @@ -180,6 +180,18 @@ def _count(self, column: Column) -> Column: ) ) + def _sum(self, column: Column) -> Column: + if column.obj.size() == 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(0, type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + return self._reduce(column, request=plc.aggregation.sum()) + def _min(self, column: Column, *, propagate_nans: bool) -> Column: if propagate_nans and column.nan_count > 0: return Column( diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 86cb2352dcc..15ad845ea78 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -148,3 +148,9 @@ def test_agg_singleton(op): q = df.select(op(pl.col("a"))) assert_gpu_result_equal(q) + + +def test_sum_empty_zero(): + df = pl.LazyFrame({"a": pl.Series(values=[], dtype=pl.Int32())}) + q = df.select(pl.col("a").sum()) + assert_gpu_result_equal(q) From 71827451fa459460894a1e6a34217e815938a562 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 6 Jan 2025 15:30:52 -0800 Subject: [PATCH 03/26] Set default logger level to warn (#17684) This PR leverages rapidsai/rapids-logger#8 to set the default logging level to warn and updates an associated test. This PR also makes a CI script change to facilitate testing by ensuring that `RAPIDS_PY_CUDA_SUFFIX` is always defined before we would insert any download commands of files from linked CI. 
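
As a quick illustration of the behavior change, here is a minimal sketch (not part of
this diff) using only the logger calls exercised in the updated test; the exact header
path for the generated logger is an assumption:

```cpp
#include <cudf/logger.hpp>  // assumed header for the generated cudf logger

int main()
{
  // With the default level now WARN, anything below WARN is dropped.
  cudf::default_logger().info("no longer emitted by default");
  cudf::default_logger().warn("warn");          // emitted
  cudf::default_logger().error("error");        // emitted
  cudf::default_logger().critical("critical");  // emitted
  return 0;
}
```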
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17684 --- ci/build_wheel_libcudf.sh | 6 +++--- cpp/CMakeLists.txt | 2 +- cpp/tests/utilities_tests/logger_tests.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index af49942c8cd..d80e4fef0d0 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -1,11 +1,13 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -euo pipefail package_name="libcudf" package_dir="python/libcudf" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + rapids-logger "Generating build requirements" rapids-dependency-file-generator \ @@ -28,8 +30,6 @@ export PIP_NO_BUILD_ISOLATION=0 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - mkdir -p ${package_dir}/final_dist python -m auditwheel repair \ --exclude libnvcomp.so.4 \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cb814aa8c0f..af92b7ceaf5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -276,7 +276,7 @@ rapids_cpm_init() include(${rapids-cmake-dir}/cpm/rapids_logger.cmake) rapids_cpm_rapids_logger() -rapids_make_logger(cudf EXPORT_SET cudf-exports) +rapids_make_logger(cudf EXPORT_SET cudf-exports LOGGER_DEFAULT_LEVEL WARN) # find jitify include(cmake/thirdparty/get_jitify.cmake) diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index 58396115a54..b5d20325b75 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -55,7 +55,7 @@ TEST_F(LoggerTest, DefaultLevel) cudf::default_logger().warn("warn"); cudf::default_logger().error("error"); cudf::default_logger().critical("critical"); - ASSERT_EQ(this->sink_content(), "info\nwarn\nerror\ncritical\n"); + ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) From a0487be669326175982c8bfcdab4d61184c88e27 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 6 Jan 2025 17:45:33 -0800 Subject: [PATCH 04/26] Move unnecessary utilities from cudf._lib.scalar (#17636) In preparation for transitioning `DeviceScalar` to pylibcudf's `Scalar`, moving `_is_null_host_scalar` (a pure Python function) to `cudf.utils.utils` and removes `as_device_scalar` in favor of going through `cudf.Scalar` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17636 --- python/cudf/cudf/_lib/scalar.pyx | 23 --------------------- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 15 ++++++-------- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/column/numerical.py | 4 ++-- python/cudf/cudf/core/dataframe.py | 8 +++++-- python/cudf/cudf/core/scalar.py | 4 ++-- python/cudf/cudf/utils/dtypes.py | 2 +- python/cudf/cudf/utils/utils.py | 9 ++++++++ 9 files changed, 28 insertions(+), 41 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 40bd50acf16..fd6d0257940 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -260,26 +260,3 @@ cdef class DeviceScalar: self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ (cdtype_id) ] - - -def as_device_scalar(val, dtype=None): - if isinstance(val, (cudf.Scalar, DeviceScalar)): - if dtype == val.dtype or dtype is None: - if isinstance(val, DeviceScalar): - return val - else: - return val.device_value - else: - raise TypeError("Can't update dtype of existing GPU scalar") - else: - return cudf.Scalar(val, dtype=dtype).device_value - - -def _is_null_host_scalar(slr): - if cudf.utils.utils.is_na_like(slr): - return True - elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \ - slr is pd.NaT: - return True - else: - return False diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index b10b8dfe207..d705b4d4c21 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -621,7 +621,7 @@ def ordered(self) -> bool: def __setitem__(self, key, value): if cudf.api.types.is_scalar( value - ) and cudf._lib.scalar._is_null_host_scalar(value): + ) and cudf.utils.utils._is_null_host_scalar(value): to_add_categories = 0 else: if cudf.api.types.is_scalar(value): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 31efe267c96..24b657f1c32 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,7 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.scalar import as_device_scalar from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -71,7 +70,7 @@ min_signed_type, min_unsigned_type, ) -from cudf.utils.utils import _array_ufunc, mask_dtype +from cudf.utils.utils import _array_ufunc, _is_null_host_scalar, 
mask_dtype if TYPE_CHECKING: import builtins @@ -777,9 +776,7 @@ def fillna( if not self.has_nulls(include_nan=True): return self.copy() elif method is None: - if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar( - fill_value - ): + if is_scalar(fill_value) and _is_null_host_scalar(fill_value): return self.copy() else: fill_value = self._validate_fillna_value(fill_value) @@ -1984,12 +1981,12 @@ def as_column( column = Column.from_pylibcudf( plc.filling.sequence( len(arbitrary), - as_device_scalar( + cudf.Scalar( arbitrary.start, dtype=np.dtype(np.int64) - ).c_value, - as_device_scalar( + ).device_value.c_value, + cudf.Scalar( arbitrary.step, dtype=np.dtype(np.int64) - ).c_value, + ).device_value.c_value, ) ) if cudf.get_option("default_integer_bitwidth") and dtype is None: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 3d9440cdf21..6283e498842 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -236,7 +236,7 @@ def from_sequences( # Build Data, Mask & Offsets for data in arbitrary: - if cudf._lib.scalar._is_null_host_scalar(data): + if cudf.utils.utils._is_null_host_scalar(data): mask_col.append(False) offset_vals.append(offset) else: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 4405e153b0c..8fe5299fcdd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -151,7 +151,7 @@ def __setitem__(self, key: Any, value: Any): cudf.Scalar( value, dtype=self.dtype - if cudf._lib.scalar._is_null_host_scalar(value) + if cudf.utils.utils._is_null_host_scalar(value) else None, ) if is_scalar(value) @@ -789,7 +789,7 @@ def _normalize_find_and_replace_input( ) # Scalar case if len(col_to_normalize) == 1: - if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]): + if cudf.utils.utils._is_null_host_scalar(col_to_normalize[0]): return normalized_column.astype(input_column_dtype) if np.isinf(col_to_normalize[0]): return normalized_column diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3334b57ce1b..b2121511a14 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -92,7 +92,11 @@ min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api +from cudf.utils.utils import ( + GetAttrGetItemMixin, + _external_only_api, + _is_null_host_scalar, +) if TYPE_CHECKING: from cudf._typing import ColumnLike, Dtype, NotImplementedType @@ -3371,7 +3375,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if isinstance(value, (np.ndarray, cupy.ndarray)): dtype = value.dtype value = value.item() - if libcudf.scalar._is_null_host_scalar(value): + if _is_null_host_scalar(value): dtype = "str" value = as_column( value, diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 80dd0921f9c..7d246960cc9 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -178,13 +178,13 @@ def dtype(self): def is_valid(self): if not self._is_host_value_current: self._device_value_to_host() - return not cudf._lib.scalar._is_null_host_scalar(self._host_value) + return not cudf.utils.utils._is_null_host_scalar(self._host_value) def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): - valid = 
not cudf._lib.scalar._is_null_host_scalar(value) + valid = not cudf.utils.utils._is_null_host_scalar(value) if isinstance(value, list): if dtype is not None: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index ca8f9cac2d0..31a8f4de3b3 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -198,7 +198,7 @@ def to_cudf_compatible_scalar(val, dtype=None): If `val` is None, returns None. """ - if cudf._lib.scalar._is_null_host_scalar(val) or isinstance( + if cudf.utils.utils._is_null_host_scalar(val) or isinstance( val, cudf.Scalar ): return val diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c83c1cbe895..0adaaa60654 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -341,6 +341,15 @@ def is_na_like(obj): return obj is None or obj is cudf.NA or obj is cudf.NaT +def _is_null_host_scalar(slr) -> bool: + # slr is NA like or NaT like + return ( + is_na_like(slr) + or (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) + or slr is pd.NaT + ) + + def _warn_no_dask_cudf(fn): @functools.wraps(fn) def wrapper(self): From f3081229379a7d92d7193a37a71bc43ad7a3d0fa Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 7 Jan 2025 10:20:27 -0600 Subject: [PATCH 05/26] Java Parquet reads via multiple host buffers (#17673) Adds a custom cuio datasource that can provide file data via multiple host memory buffers. This allows data that arrives from multiple threads in multiple buffers to be read directly rather than requiring the buffers to be concatenated into a single host memory buffer before reading. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Alessandro Bellina (https://github.com/abellina) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/17673 --- .../ai/rapids/cudf/ParquetChunkedReader.java | 59 +++++-- java/src/main/java/ai/rapids/cudf/Table.java | 44 +++++- java/src/main/native/CMakeLists.txt | 5 +- .../include/multi_host_buffer_source.hpp | 57 +++++++ java/src/main/native/src/ChunkedReaderJni.cpp | 58 ++++--- java/src/main/native/src/TableJni.cpp | 26 +-- .../native/src/multi_host_buffer_source.cpp | 148 ++++++++++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 41 ++++- 8 files changed, 390 insertions(+), 48 deletions(-) create mode 100644 java/src/main/native/include/multi_host_buffer_source.hpp create mode 100644 java/src/main/native/src/multi_host_buffer_source.cpp diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index 53af52eff07..5e544e92a77 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,12 +62,13 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f * @param filePath Full path of the input Parquet file to read. 
*/ public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, File filePath) { - handle = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - + long[] handles = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + filePath.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -100,12 +101,41 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, HostMemoryBuffer buffer, long offset, long len) { - handle = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, - buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; + if (handle == 0) { + throw new IllegalStateException("Cannot create native chunked Parquet reader object."); + } + multiHostBufferSourceHandle = handles[1]; + } + /** + * Construct the reader instance from a read limit and data in host memory buffers. + * + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param passReadLimit Limit on the amount of memory used for reading and decompressing data or + * 0 if there is no limit + * @param opts The options for Parquet reading. + * @param buffers Array of buffers containing the file data. The buffers are logically + * concatenated to construct the file being read. + */ + public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, + ParquetOptions opts, HostMemoryBuffer... buffers) { + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -181,6 +211,10 @@ public void close() { DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); dataSourceHandle = 0; } + if (multiHostBufferSourceHandle != 0) { + destroyMultiHostBufferSource(multiHostBufferSourceHandle); + multiHostBufferSourceHandle = 0; + } } @@ -196,6 +230,8 @@ public void close() { private long dataSourceHandle = 0; + private long multiHostBufferSourceHandle = 0; + /** * Create a native chunked Parquet reader object on heap and return its memory address. * @@ -206,13 +242,12 @@ public void close() { * @param filterColumnNames Name of the columns to read, or an empty array if we want to read all. * @param binaryToString Whether to convert the corresponding column to String if it is binary. 
* @param filePath Full path of the file to read, or given as null if reading from a buffer. - * @param bufferAddrs The address of a buffer to read from, or 0 if we are not using that buffer. - * @param length The length of the buffer to read from. + * @param bufferAddrsSizes The address and size pairs of buffers to read from, or null if we are not using buffers. * @param timeUnit Return type of time unit for timestamps. */ - private static native long create(long chunkSizeByteLimit, long passReadLimit, - String[] filterColumnNames, boolean[] binaryToString, - String filePath, long bufferAddrs, long length, int timeUnit); + private static native long[] create(long chunkSizeByteLimit, long passReadLimit, + String[] filterColumnNames, boolean[] binaryToString, + String filePath, long[] bufferAddrsSizes, int timeUnit); private static native long createWithDataSource(long chunkedSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); @@ -222,4 +257,6 @@ private static native long createWithDataSource(long chunkedSizeByteLimit, private static native long[] readChunk(long handle); private static native void close(long handle); + + private static native void destroyMultiHostBufferSource(long handle); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b01ce31b1f3..298f2cff6f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -313,12 +313,11 @@ private static native long readAndInferJSON(long address, long length, * all of them * @param binaryToString whether to convert this column to String if binary * @param filePath the path of the file to read, or null if no path should be read. - * @param address the address of the buffer to read from or 0 if we should not. - * @param length the length of the buffer to read from. + * @param addrsAndSizes the address and size pairs for every buffer or null for no buffers. * @param timeUnit return type of TimeStamp in units */ private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, - long address, long length, int timeUnit) throws CudfException; + long[] addrsAndSizes, int timeUnit) throws CudfException; private static native long[] readParquetFromDataSource(String[] filterColumnNames, boolean[] binaryToString, int timeUnit, @@ -1357,7 +1356,7 @@ public static Table readParquet(File path) { */ public static Table readParquet(ParquetOptions opts, File path) { return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); + path.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId())); } /** @@ -1402,6 +1401,14 @@ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, } } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffer raw parquet formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU. 
+ */ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, long len) { return readParquet(opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } @@ -1422,10 +1429,35 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); + } + + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffers Buffers containing the Parquet data. The buffers are logically concatenated + * in order to construct the file being read. + * @return the data parsed as a table on the GPU. + */ + public static Table readParquet(ParquetOptions opts, HostMemoryBuffer... buffers) { + assert buffers.length > 0; + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param ds custom datasource to provide the Parquet file data + * @return the data parsed as a table on the GPU. + */ public static Table readParquet(ParquetOptions opts, DataSource ds) { long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); try { diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9ff43feeac6..bd1714aa476 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -156,8 +156,9 @@ add_library( src/ScalarJni.cpp src/TableJni.cpp src/aggregation128_utils.cu - src/maps_column_view.cu src/check_nvcomp_output_sizes.cu + src/maps_column_view.cu + src/multi_host_buffer_source.cpp ) # Disable NVTX if necessary diff --git a/java/src/main/native/include/multi_host_buffer_source.hpp b/java/src/main/native/include/multi_host_buffer_source.hpp new file mode 100644 index 00000000000..2aedb2321e4 --- /dev/null +++ b/java/src/main/native/include/multi_host_buffer_source.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "jni_utils.hpp"
+
+#include <cudf/io/datasource.hpp>
+
+#include <vector>
+
+namespace cudf {
+namespace jni {
+
+/**
+ * @brief A custom datasource providing data from an array of host memory buffers.
+ */
+class multi_host_buffer_source : public cudf::io::datasource {
+  std::vector<uint8_t const*> addrs_;
+  std::vector<size_t> offsets_;
+
+  size_t locate_offset_index(size_t offset);
+
+ public:
+  explicit multi_host_buffer_source(native_jlongArray const& addrs_sizes);
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override;
+  size_t host_read(size_t offset, size_t size, uint8_t* dst) override;
+  bool supports_device_read() const override { return true; }
+  bool is_device_read_preferred(size_t size) const override { return true; }
+  std::unique_ptr<buffer> device_read(size_t offset,
+                                      size_t size,
+                                      rmm::cuda_stream_view stream) override;
+  size_t device_read(size_t offset,
+                     size_t size,
+                     uint8_t* dst,
+                     rmm::cuda_stream_view stream) override;
+  std::future<size_t> device_read_async(size_t offset,
+                                        size_t size,
+                                        uint8_t* dst,
+                                        rmm::cuda_stream_view stream) override;
+  size_t size() const override { return offsets_.back(); }
+};
+
+} // namespace jni
+} // namespace cudf
diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp
index cf04a87262f..4967e0b2b04 100644
--- a/java/src/main/native/src/ChunkedReaderJni.cpp
+++ b/java/src/main/native/src/ChunkedReaderJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 
 #include "cudf_jni_apis.hpp"
 #include "jni_utils.hpp"
+#include "multi_host_buffer_source.hpp"
 
 #include
 #include
@@ -36,7 +37,7 @@ extern "C" {
 
 // This function should take all the parameters that `Table.readParquet` takes,
 // plus one more parameter `long chunkSizeByteLimit`.
-JNIEXPORT jlong JNICALL +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jclass, jlong chunk_read_limit, @@ -44,27 +45,26 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inp_file_path, - jlong buffer, - jlong buffer_length, + jlongArray addrs_sizes, jint unit) { - JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0); + JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", nullptr); bool read_buffer = true; - if (buffer == 0) { - JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", 0); + if (addrs_sizes == nullptr) { + JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", nullptr); read_buffer = false; } else if (inp_file_path != nullptr) { - JNI_THROW_NEW( - env, cudf::jni::ILLEGAL_ARG_CLASS, "Cannot pass in both a buffer and an inp_file_path", 0); - } else if (buffer_length <= 0) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0); + JNI_THROW_NEW(env, + cudf::jni::ILLEGAL_ARG_CLASS, + "Cannot pass in both buffers and an inp_file_path", + nullptr); } try { cudf::jni::auto_set_device(env); cudf::jni::native_jstring filename(env, inp_file_path); if (!read_buffer && filename.is_empty()) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", 0); + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", nullptr); } cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); @@ -75,9 +75,15 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); (void)n_col_binary_read; - auto const source = read_buffer ? 
cudf::io::source_info(reinterpret_cast<char*>(buffer),
-                                                          static_cast<std::size_t>(buffer_length))
-                                  : cudf::io::source_info(filename.get());
+    cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_sizes);
+    std::unique_ptr<cudf::jni::multi_host_buffer_source> multi_buffer_source;
+    cudf::io::source_info source;
+    if (read_buffer) {
+      multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes));
+      source = cudf::io::source_info(multi_buffer_source.get());
+    } else {
+      source = cudf::io::source_info(filename.get());
+    }
 
     auto opts_builder = cudf::io::parquet_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
@@ -86,13 +92,18 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env,
     auto const read_opts = opts_builder.convert_strings_to_categories(false)
                              .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
                              .build();
-
-    return reinterpret_cast<jlong>(
+    n_addrs_sizes.cancel();
+    n_col_binary_read.cancel();
+    auto reader_handle = reinterpret_cast<jlong>(
       new cudf::io::chunked_parquet_reader(static_cast<std::size_t>(chunk_read_limit),
                                            static_cast<std::size_t>(pass_read_limit),
                                            read_opts));
+    cudf::jni::native_jlongArray result(env, 2);
+    result[0] = reader_handle;
+    result[1] = cudf::jni::release_as_jlong(multi_buffer_source);
+    return result.get_jArray();
   }
-  CATCH_STD(env, 0);
+  CATCH_STD(env, nullptr);
 }
 
 JNIEXPORT jlong JNICALL
@@ -177,6 +188,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* env,
   CATCH_STD(env, );
 }
 
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_destroyMultiHostBufferSource(
+  JNIEnv* env, jclass, jlong handle)
+{
+  JNI_NULL_CHECK(env, handle, "handle is null", );
+
+  try {
+    delete reinterpret_cast<cudf::jni::multi_host_buffer_source*>(handle);
+  }
+  CATCH_STD(env, );
+}
+
 //
 // Chunked ORC reader JNI
 //
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index ed35f35794d..a6c7ae9ba18 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -19,6 +19,7 @@
 #include "jni_compiled_expr.hpp"
 #include "jni_utils.hpp"
 #include "jni_writer_data_sink.hpp"
+#include "multi_host_buffer_source.hpp"
 
 #include
 #include
@@ -2071,20 +2072,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
                                                                    jobjectArray filter_col_names,
                                                                    jbooleanArray j_col_binary_read,
                                                                    jstring inputfilepath,
-                                                                   jlong buffer,
-                                                                   jlong buffer_length,
+                                                                   jlongArray addrs_and_sizes,
                                                                    jint unit)
 {
   JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0);
   bool read_buffer = true;
-  if (buffer == 0) {
+  if (addrs_and_sizes == nullptr) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
     read_buffer = false;
   } else if (inputfilepath != NULL) {
     JNI_THROW_NEW(
       env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL);
-  } else if (buffer_length <= 0) {
-    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL);
   }
 
   try {
@@ -2096,10 +2094,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
 
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
     cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
-
-    auto source = read_buffer ? cudf::io::source_info(reinterpret_cast<char*>(buffer),
-                                                      static_cast<std::size_t>(buffer_length))
-                              : cudf::io::source_info(filename.get());
+    cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_and_sizes);
+    std::unique_ptr<cudf::jni::multi_host_buffer_source> multi_buffer_source;
+    cudf::io::source_info source;
+    if (read_buffer) {
+      multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes));
+      source = cudf::io::source_info(multi_buffer_source.get());
+    } else {
+      source = cudf::io::source_info(filename.get());
+    }
 
     auto builder = cudf::io::parquet_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
@@ -2110,7 +2113,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
       builder.convert_strings_to_categories(false)
         .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
         .build();
-    return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl);
+    auto tbl = cudf::io::read_parquet(opts).tbl;
+    n_col_binary_read.cancel();
+    n_addrs_sizes.cancel();
+    return convert_table_for_return(env, tbl);
   }
   CATCH_STD(env, NULL);
 }
diff --git a/java/src/main/native/src/multi_host_buffer_source.cpp b/java/src/main/native/src/multi_host_buffer_source.cpp
new file mode 100644
index 00000000000..c577fc680ba
--- /dev/null
+++ b/java/src/main/native/src/multi_host_buffer_source.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "multi_host_buffer_source.hpp"
+
+#include
+#include
+#include
+#include
+
+namespace cudf {
+namespace jni {
+
+multi_host_buffer_source::multi_host_buffer_source(native_jlongArray const& addrs_sizes)
+{
+  if (addrs_sizes.size() % 2 != 0) {
+    throw std::logic_error("addrs_sizes length not a multiple of 2");
+  }
+  auto count = addrs_sizes.size() / 2;
+  addrs_.reserve(count);
+  offsets_.reserve(count + 1);
+  size_t total_size = 0;
+  for (int i = 0; i < addrs_sizes.size(); i += 2) {
+    addrs_.push_back(reinterpret_cast<uint8_t const*>(addrs_sizes[i]));
+    offsets_.push_back(total_size);
+    total_size += addrs_sizes[i + 1];
+  }
+  offsets_.push_back(total_size);
+}
+
+size_t multi_host_buffer_source::locate_offset_index(size_t offset)
+{
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  auto start = offsets_.begin();
+  auto it    = std::upper_bound(start, offsets_.end(), offset);
+  return (it - start) - 1;
+}
+
+std::unique_ptr<cudf::io::datasource::buffer> multi_host_buffer_source::host_read(size_t offset,
+                                                                                  size_t size)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  auto const end_offset = offset + size;
+  if (end_offset > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+  auto buffer_index = locate_offset_index(offset);
+  auto next_offset  = offsets_[buffer_index + 1];
+  if (end_offset <= next_offset) {
+    // read range hits only a single buffer, so return a zero-copy view of the data
+    auto src = addrs_[buffer_index] + offset - offsets_[buffer_index];
+    return std::make_unique<non_owning_buffer>(src, size);
+  }
+  auto buf        = std::vector<uint8_t>(size);
+  auto bytes_read = host_read(offset, size, buf.data());
+  if (bytes_read != size) {
+    std::stringstream ss;
+    ss << "Expected host read of " << size << " found " << bytes_read;
+    throw std::logic_error(ss.str());
+  }
+  return std::make_unique<owning_buffer<std::vector<uint8_t>>>(std::move(buf));
+}
+
+size_t multi_host_buffer_source::host_read(size_t offset, size_t size, uint8_t* dst)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+  auto buffer_index = locate_offset_index(offset);
+  auto bytes_left   = size;
+  while (bytes_left > 0) {
+    auto next_offset   = offsets_[buffer_index + 1];
+    auto buffer_left   = next_offset - offset;
+    auto buffer_offset = offset - offsets_[buffer_index];
+    auto src           = addrs_[buffer_index] + buffer_offset;
+    auto copy_size     = std::min(buffer_left, bytes_left);
+    std::memcpy(dst, src, copy_size);
+    offset += copy_size;
+    dst += copy_size;
+    bytes_left -= copy_size;
+    ++buffer_index;
+  }
+  return size;
+}
+
+std::unique_ptr<cudf::io::datasource::buffer> multi_host_buffer_source::device_read(
+  size_t offset, size_t size, rmm::cuda_stream_view stream)
+{
+  rmm::device_buffer buf(size, stream);
+  auto dst        = static_cast<uint8_t*>(buf.data());
+  auto bytes_read = device_read(offset, size, dst, stream);
+  if (bytes_read != size) {
+    std::stringstream ss;
+    ss << "Expected device read of " << size << " found " << bytes_read;
+    throw std::logic_error(ss.str());
+  }
+  return std::make_unique<owning_buffer<rmm::device_buffer>>(std::move(buf));
+}
+
+size_t multi_host_buffer_source::device_read(size_t offset,
+                                             size_t size,
+                                             uint8_t* dst,
+                                             rmm::cuda_stream_view stream)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+  auto buffer_index = locate_offset_index(offset);
+  auto bytes_left   = size;
+  while (bytes_left > 0) {
+    auto next_offset   = offsets_[buffer_index + 1];
+    auto buffer_left   = next_offset - offset;
+    auto buffer_offset = offset - offsets_[buffer_index];
+    auto src           = addrs_[buffer_index] + buffer_offset;
+    auto copy_size     = std::min(buffer_left, bytes_left);
+    CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, copy_size, cudaMemcpyHostToDevice, stream.value()));
+    offset += copy_size;
+    dst += copy_size;
+    bytes_left -= copy_size;
+    ++buffer_index;
+  }
+  return size;
+}
+
+std::future<size_t> multi_host_buffer_source::device_read_async(size_t offset,
+                                                                size_t size,
+                                                                uint8_t* dst,
+                                                                rmm::cuda_stream_view stream)
+{
+  std::promise<size_t> p;
+  p.set_value(device_read(offset, size, dst, stream));
+  return p.get_future();
+}
+
+} // namespace jni
+} // namespace cudf
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index c7fcb1756b6..7eb32892bad 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1,6 +1,6 @@
 /*
 *
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -47,8 +47,11 @@
 import java.math.BigInteger;
 import java.math.RoundingMode;
 import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.nio.file.StandardOpenOption;
 import java.util.*;
 import java.util.function.Function;
 import java.util.stream.Collectors;
@@ -1714,6 +1717,42 @@ void testChunkedReadParquet() {
     }
   }
 
+  @Test
+  void testChunkedReadParquetHostBuffers() throws Exception {
+    long size = TEST_PARQUET_FILE_CHUNKED_READ.length();
+    java.nio.file.Path path = TEST_PARQUET_FILE_CHUNKED_READ.toPath();
+    try (HostMemoryBuffer buf1 = HostMemoryBuffer.allocate(size / 2);
+         HostMemoryBuffer buf2 = HostMemoryBuffer.allocate(size - buf1.getLength())) {
+      try (SeekableByteChannel channel = Files.newByteChannel(path, StandardOpenOption.READ)) {
+        ByteBuffer bb1 = buf1.asByteBuffer();
+        while (bb1.hasRemaining()) {
+          if (channel.read(bb1) == -1) {
+            throw new EOFException("error reading first buffer");
+          }
+        }
+        ByteBuffer bb2 = buf2.asByteBuffer();
+        while (bb2.hasRemaining()) {
+          if (channel.read(bb2) == -1) {
+            throw new EOFException("error reading second buffer");
+          }
+        }
+      }
+      ParquetOptions opts = ParquetOptions.DEFAULT;
+      try (ParquetChunkedReader reader = new ParquetChunkedReader(240000, 0, opts, buf1, buf2)) {
+        int numChunks = 0;
+        long totalRows = 0;
+        while(reader.hasNext()) {
+          ++numChunks;
+          try(Table chunk = reader.readChunk()) {
+            totalRows += chunk.getRowCount();
+          }
+        }
+        assertEquals(2, numChunks);
+        assertEquals(40000, totalRows);
+      }
+    }
+  }
+
   @Test
   void testChunkedReadParquetFromDataSource() throws IOException {
     try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ);

From caf97ef24ae814054e6b35c46e0633c7ce7b12ed Mon Sep 17 00:00:00 2001
From: Yunsong Wang
Date: Tue, 7 Jan 2025 09:45:58 -0800
Subject: [PATCH 06/26] Add XXHash_32 hasher (#17533)

Contributes to #17531

This PR introduces the xxhash_32 hasher to libcudf as a preparatory step
for evaluating the impact of replacing murmurhash3_x86_32 with xxhash_32
as the default hash.
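
For reference, here is a minimal usage sketch of the new C++ API (not part of the
diff itself); the call and expected digests mirror the `xxhash_32_test.cpp` unit test
added below, while the wrapper function name is just illustrative:

```cpp
#include <cudf/hashing.hpp>
#include <cudf/table/table_view.hpp>

// Returns one uint32 xxHash32 digest per input row. With seed 0 and a single
// int32 column {0, 42, 825}, the new test expects {148298089, 1161967057,
// 1066694813}.
std::unique_ptr<cudf::column> hash_rows(cudf::table_view const& input)
{
  return cudf::hashing::xxhash_32(input, 0 /*seed*/);
}
```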
Authors:
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/17533
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/include/cudf/hashing.hpp                  |  22 ++-
 cpp/include/cudf/hashing/detail/hashing.hpp   |   7 +-
 cpp/include/cudf/hashing/detail/xxhash_32.cuh | 118 +++++++++++++++
 cpp/src/hash/xxhash_32.cu                     | 136 ++++++++++++++++++
 cpp/src/io/orc/dict_enc.cu                    |   3 +-
 cpp/src/io/parquet/chunk_dict.cu              |   3 +-
 cpp/src/join/join_common_utils.cuh            |   3 +-
 cpp/tests/CMakeLists.txt                      |   3 +-
 cpp/tests/hashing/xxhash_32_test.cpp          |  67 +++++++++
 python/cudf/cudf/core/indexed_frame.py        |  18 ++-
 python/cudf/cudf/tests/test_dataframe.py      |  35 ++++-
 python/pylibcudf/pylibcudf/hashing.pxd        |   6 +-
 python/pylibcudf/pylibcudf/hashing.pyi        |   1 +
 python/pylibcudf/pylibcudf/hashing.pyx        |  35 ++++-
 python/pylibcudf/pylibcudf/libcudf/hash.pxd   |   7 +-
 .../pylibcudf/pylibcudf/tests/test_hashing.py |  43 +++---
 17 files changed, 473 insertions(+), 35 deletions(-)
 create mode 100644 cpp/include/cudf/hashing/detail/xxhash_32.cuh
 create mode 100644 cpp/src/hash/xxhash_32.cu
 create mode 100644 cpp/tests/hashing/xxhash_32_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index af92b7ceaf5..9dabe4e8800 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -461,6 +461,7 @@ add_library(
   src/hash/sha256_hash.cu
   src/hash/sha384_hash.cu
   src/hash/sha512_hash.cu
+  src/hash/xxhash_32.cu
   src/hash/xxhash_64.cu
   src/interop/dlpack.cpp
   src/interop/arrow_utilities.cpp
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index 307a52cd242..88034b4f804 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -166,6 +166,26 @@ std::unique_ptr<column> sha512(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Computes the XXHash_32 hash value of each row in the given table
+ *
+ * This function computes the hash of each column using the `seed` for the first column
+ * and the resulting hash as a seed for the next column and so on.
+ * The result is a uint32 value for each row.
+ *
+ * @param input The table of columns to hash
+ * @param seed Optional seed value to use for the hash function
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ *
+ * @returns A column where each row is the hash of a row from the input
+ */
+std::unique_ptr<column> xxhash_32(
+  table_view const& input,
+  uint32_t seed                     = DEFAULT_HASH_SEED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Computes the XXHash_64 hash value of each row in the given table
  *
diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp
index 7cb80081a95..f796ff4526e 100644
--- a/cpp/include/cudf/hashing/detail/hashing.hpp
+++ b/cpp/include/cudf/hashing/detail/hashing.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -61,6 +61,11 @@ std::unique_ptr<column> sha512(table_view const& input,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr);
 
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint64_t seed,
+                                  rmm::cuda_stream_view,
+                                  rmm::device_async_resource_ref mr);
+
 std::unique_ptr<column> xxhash_64(table_view const& input,
                                   uint64_t seed,
                                   rmm::cuda_stream_view,
                                   rmm::device_async_resource_ref mr);
diff --git a/cpp/include/cudf/hashing/detail/xxhash_32.cuh b/cpp/include/cudf/hashing/detail/xxhash_32.cuh
new file mode 100644
index 00000000000..bb6e7f18fbc
--- /dev/null
+++ b/cpp/include/cudf/hashing/detail/xxhash_32.cuh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/hashing.hpp>
+#include <cudf/hashing/detail/hash_functions.cuh>
+#include <cudf/lists/list_view.hpp>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/structs/struct_view.hpp>
+#include <cudf/types.hpp>
+
+#include <cuco/hash_functions.cuh>
+#include <cuda/std/cstddef>
+
+namespace cudf::hashing::detail {
+
+template <typename Key>
+struct XXHash_32 {
+  using result_type = std::uint32_t;
+
+  CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}
+
+  __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }
+
+  __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
+                                                 std::uint64_t size) const
+  {
+    return this->_impl.compute_hash(bytes, size);
+  }
+
+ private:
+  template <typename T>
+  __device__ constexpr result_type compute(T const& key) const
+  {
+    return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
+  }
+
+  cuco::xxhash_32<Key> _impl;
+};
+
+template <>
+XXHash_32<bool>::result_type __device__ inline XXHash_32<bool>::operator()(bool const& key) const
+{
+  return this->compute(static_cast<uint8_t>(key));
+}
+
+template <>
+XXHash_32<float>::result_type __device__ inline XXHash_32<float>::operator()(float const& key) const
+{
+  return this->compute(normalize_nans_and_zeros(key));
+}
+
+template <>
+XXHash_32<double>::result_type __device__ inline XXHash_32<double>::operator()(
+  double const& key) const
+{
+  return this->compute(normalize_nans_and_zeros(key));
+}
+
+template <>
+XXHash_32<cudf::string_view>::result_type
+  __device__ inline XXHash_32<cudf::string_view>::operator()(cudf::string_view const& key) const
+{
+  return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
+                             key.size_bytes());
+}
+
+template <>
+XXHash_32<numeric::decimal32>::result_type
+  __device__ inline XXHash_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<numeric::decimal64>::result_type
+  __device__ inline XXHash_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<numeric::decimal128>::result_type
+  __device__ inline XXHash_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<cudf::list_view>::result_type __device__ inline XXHash_32<cudf::list_view>::operator()(
+  cudf::list_view const& key) const
+{
+  CUDF_UNREACHABLE("List column hashing is not supported");
+}
+
+template <>
+XXHash_32<cudf::struct_view>::result_type
+  __device__ inline XXHash_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
+}
+
+} // namespace cudf::hashing::detail
diff --git a/cpp/src/hash/xxhash_32.cu b/cpp/src/hash/xxhash_32.cu
new file mode 100644
index 00000000000..40503f7f911
--- /dev/null
+++ b/cpp/src/hash/xxhash_32.cu
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/algorithm.cuh>
+#include <cudf/hashing/detail/hashing.hpp>
+#include <cudf/hashing/detail/xxhash_32.cuh>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/std/limits>
+#include <thrust/tabulate.h>
+
+namespace cudf {
+namespace hashing {
+namespace detail {
+
+namespace {
+
+/**
+ * @brief Computes the hash value of a row in the given table.
+ *
+ * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
+ */
+template <typename Nullate>
+class device_row_hasher {
+ public:
+  device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
+    : _check_nulls(nulls), _table(t), _seed(seed)
+  {
+  }
+
+  __device__ auto operator()(size_type row_index) const noexcept
+  {
+    return cudf::detail::accumulate(
+      _table.begin(),
+      _table.end(),
+      _seed,
+      [row_index, nulls = _check_nulls] __device__(auto hash, auto column) {
+        return cudf::type_dispatcher(
+          column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
+      });
+  }
+
+  /**
+   * @brief Computes the hash value of an element in the given column.
+   */
+  class element_hasher_adapter {
+   public:
+    template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const& col,
+                                          size_type const row_index,
+                                          Nullate const _check_nulls,
+                                          hash_value_type const _seed) const noexcept
+    {
+      if (_check_nulls && col.is_null(row_index)) {
+        return cuda::std::numeric_limits<hash_value_type>::max();
+      }
+      auto const hasher = XXHash_32<T>{_seed};
+      return hasher(col.element<T>(row_index));
+    }
+
+    template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const&,
+                                          size_type const,
+                                          Nullate const,
+                                          hash_value_type const) const noexcept
+    {
+      CUDF_UNREACHABLE("Unsupported type for XXHash_32");
+    }
+  };
+
+  Nullate const _check_nulls;
+  table_device_view const _table;
+  hash_value_type const _seed;
+};
+
+} // namespace
+
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr)
+{
+  auto output = make_numeric_column(data_type(type_to_id<hash_value_type>()),
+                                    input.num_rows(),
+                                    mask_state::UNALLOCATED,
+                                    stream,
+                                    mr);
+
+  // Return early if there's nothing to hash
+  if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }
+
+  bool const nullable   = has_nulls(input);
+  auto const input_view = table_device_view::create(input, stream);
+  auto output_view      = output->mutable_view();
+
+  // Compute the hash value for each row
+  thrust::tabulate(rmm::exec_policy(stream),
+                   output_view.begin<hash_value_type>(),
+                   output_view.end<hash_value_type>(),
+                   device_row_hasher(nullable, *input_view, seed));
+
+  return output;
+}
+
+} // namespace detail
+
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::xxhash_32(input, seed, stream, mr);
+}
+
+} // namespace hashing
+} // namespace cudf
diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 7facc6497ed..469f933f918 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include
 #include
+#include
 
 #include
 #include
diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index b5f9b894c46..0d40a1f7b1b 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include
 #include
+#include
 
 #include
 #include
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 4f75908fe72..37c5698f654 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index e5c29314203..344979e1288 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -192,6 +192,7 @@ ConfigureTest(
   hashing/sha256_test.cpp
   hashing/sha384_test.cpp
   hashing/sha512_test.cpp
+  hashing/xxhash_32_test.cpp
   hashing/xxhash_64_test.cpp
 )
diff --git a/cpp/tests/hashing/xxhash_32_test.cpp b/cpp/tests/hashing/xxhash_32_test.cpp
new file mode 100644
index 00000000000..9e3c66b0d0b
--- /dev/null
+++ b/cpp/tests/hashing/xxhash_32_test.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/hashing.hpp>
+
+class XXHash_32_Test : public cudf::test::BaseFixture {};
+
+TEST_F(XXHash_32_Test, TestInteger)
+{
+  auto col1           = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 42, 825}};
+  auto constexpr seed = 0u;
+  auto const output   = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed);
+
+  // Expected results were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({148298089u, 1161967057u, 1066694813u});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
+}
+
+TEST_F(XXHash_32_Test, TestDouble)
+{
+  auto col1           = cudf::test::fixed_width_column_wrapper<double>{{-8., 25., 90.}};
+  auto constexpr seed = 42u;
+
+  auto const output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed);
+
+  // Expected results were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({2276435783u, 3120212431u, 3454197470u});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
+}
+
+TEST_F(XXHash_32_Test, StringType)
+{
+  auto col1           = cudf::test::strings_column_wrapper({"I", "am", "AI"});
+  auto constexpr seed = 825u;
+
+  auto output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed);
+
+  // Expected results were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({320624298u, 1612654309u, 1409499009u});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected);
+}
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 6854cb02aa5..e9ed74f804b 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA 
CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. """Base class for Frame types that have an index.""" from __future__ import annotations @@ -2836,16 +2836,22 @@ def hash_values( Parameters ---------- - method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3' + method : {'murmur3', 'xxhash32', 'xxhash64', 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'}, default 'murmur3' Hash function to use: * murmur3: MurmurHash3 hash function - * md5: MD5 hash function + * xxhash32: xxHash32 hash function * xxhash64: xxHash64 hash function + * md5: MD5 hash function + * sha1: SHA-1 hash function + * sha224: SHA-224 hash function + * sha256: SHA-256 hash function + * sha384: SHA-384 hash function + * sha512: SHA-512 hash function seed : int, optional Seed value to use for the hash function. This parameter is only - supported for 'murmur3' and 'xxhash64'. + supported for 'murmur3', 'xxhash32', and 'xxhash64'. Returns @@ -2900,7 +2906,7 @@ def hash_values( 2 fe061786ea286a515b772d91b0dfcd70 dtype: object """ - seed_hash_methods = {"murmur3", "xxhash64"} + seed_hash_methods = {"murmur3", "xxhash32", "xxhash64"} if seed is None: seed = 0 elif method not in seed_hash_methods: @@ -2914,6 +2920,8 @@ def hash_values( ) if method == "murmur3": plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed) + elif method == "xxhash32": + plc_column = plc.hashing.xxhash_32(plc_table, seed) elif method == "xxhash64": plc_column = plc.hashing.xxhash_64(plc_table, seed) elif method == "md5": diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 11a9b398b50..f3cf8e36a5b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import array as arr import contextlib @@ -1440,6 +1440,7 @@ def test_assign_callable(mapping): "sha256", "sha384", "sha512", + "xxhash32", "xxhash64", ], ) @@ -1447,6 +1448,7 @@ def test_assign_callable(mapping): def test_dataframe_hash_values(nrows, method, seed): warning_expected = seed is not None and method not in { "murmur3", + "xxhash32", "xxhash64", } potential_warning = ( @@ -1472,6 +1474,7 @@ def test_dataframe_hash_values(nrows, method, seed): "sha256": object, "sha384": object, "sha512": object, + "xxhash32": np.uint32, "xxhash64": np.uint64, } assert out.dtype == expected_dtypes[method] @@ -1486,7 +1489,7 @@ def test_dataframe_hash_values(nrows, method, seed): assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) -@pytest.mark.parametrize("method", ["murmur3", "xxhash64"]) +@pytest.mark.parametrize("method", ["murmur3", "xxhash32", "xxhash64"]) def test_dataframe_hash_values_seed(method): gdf = cudf.DataFrame() data = np.arange(10) @@ -1500,6 +1503,34 @@ def test_dataframe_hash_values_seed(method): assert_neq(out_one, out_two) +def test_dataframe_hash_values_xxhash32(): + # xxhash32 has no built-in implementation in Python and we don't want to + # add a testing dependency, so we use regression tests against known good + # values. 
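# [Editor's note] A hedged sketch (not part of this patch) of how reference
# values like the ones below can be re-derived outside cudf, mirroring the
# pylibcudf tests later in this series: hash each element's 8-byte
# little-endian representation with the third-party `xxhash` package, e.g.
#
#     import struct, xxhash
#     xxhash.xxh32(struct.pack("<d", 0.0), seed=0).intdigest()
#
# (xxhash is exactly the extra testing dependency the comment above avoids.)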
+ gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) + gdf["b"] = -gdf["a"] + out_a = gdf["a"].hash_values(method="xxhash32", seed=0) + expected_a = cudf.Series( + [3736311059, 2307980487, 2906647130, 746578903, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_a, expected_a) + + out_b = gdf["b"].hash_values(method="xxhash32", seed=42) + expected_b = cudf.Series( + [1076387279, 2261349915, 531498073, 650869264, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_b, expected_b) + + out_df = gdf.hash_values(method="xxhash32", seed=0) + expected_df = cudf.Series( + [1223721700, 2885793241, 1920811472, 1146715602, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_df, expected_df) + + def test_dataframe_hash_values_xxhash64(): # xxhash64 has no built-in implementation in Python and we don't want to # add a testing dependency, so we use regression tests against known good diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd index 2d070ddda69..fbd478f963f 100644 --- a/python/pylibcudf/pylibcudf/hashing.pxd +++ b/python/pylibcudf/pylibcudf/hashing.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t @@ -16,6 +16,10 @@ cpdef Table murmurhash3_x64_128( uint64_t seed=* ) +cpdef Column xxhash_32( + Table input, + uint32_t seed=* +) cpdef Column xxhash_64( Table input, diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi index a849f5d0729..d535d842a18 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyi +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -9,6 +9,7 @@ LIBCUDF_DEFAULT_HASH_SEED: Final[int] def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_32(input: Table, seed: int = ...) -> Column: ... def xxhash_64(input: Table, seed: int = ...) -> Column: ... def md5(input: Table) -> Column: ... def sha1(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 548cffc0ce8..1f093b20c6b 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -13,6 +13,7 @@ from pylibcudf.libcudf.hash cimport ( sha256 as cpp_sha256, sha384 as cpp_sha384, sha512 as cpp_sha512, + xxhash_32 as cpp_xxhash_32, xxhash_64 as cpp_xxhash_64, ) from pylibcudf.libcudf.table.table cimport table @@ -30,6 +31,7 @@ __all__ = [ "sha256", "sha384", "sha512", + "xxhash_32", "xxhash_64", ] @@ -95,6 +97,37 @@ cpdef Table murmurhash3_x64_128( return Table.from_libcudf(move(c_result)) +cpdef Column xxhash_32( + Table input, + uint32_t seed=DEFAULT_HASH_SEED +): + """Computes the xxHash 32-bit hash value of each row in the given table. + + For details, see :cpp:func:`xxhash_32`. 
+ + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_xxhash_32( + input.view(), + seed + ) + + return Column.from_libcudf(move(c_result)) + + cpdef Column xxhash_64( Table input, uint64_t seed=DEFAULT_HASH_SEED diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd index 4e8a01b41a5..46fdf62cd6b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -44,6 +44,11 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const table_view& input ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_32( + const table_view& input, + const uint32_t seed + ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_64( const table_view& input, const uint64_t seed diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 83fb50fa4ef..7096dbe14ff 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import hashlib import struct @@ -34,7 +34,9 @@ def hash_single_uint32(val, seed=0): def hash_combine_32(lhs, rhs): - return np.uint32(lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2))) + return np.uint32( + int((lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)))) % 2**32 + ) def uint_hash_combine_32(lhs, rhs): @@ -80,22 +82,6 @@ def list_struct_table(): return data -def python_hash_value(x, method): - if method == "murmurhash3_x86_32": - return libcudf_mmh3_x86_32(x) - elif method == "murmurhash3_x64_128": - hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) - hasher.update(x) - # libcudf returns a tuple of two 64-bit integers - return hasher.utupledigest() - elif method == "xxhash_64": - return xxhash.xxh64( - x, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED - ).intdigest() - else: - return getattr(hashlib, method)(x).hexdigest() - - @pytest.mark.parametrize( "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"] ) @@ -115,6 +101,23 @@ def py_hasher(val): assert_column_eq(got, expect) +def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl): + def py_hasher(val): + return xxhash.xxh32( + scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ).intdigest() + + expect = pa.array( + [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], + type=pa.uint32(), + ) + got = plc.hashing.xxhash_32( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) + + assert_column_eq(got, expect) + + def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): return xxhash.xxh64( @@ -125,7 +128,9 @@ def py_hasher(val): [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], type=pa.uint64(), ) - got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0) + got = plc.hashing.xxhash_64( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) 
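# [Editor's note] Per the hashing.pxd/pyx hunks above, xxhash_32 and xxhash_64
# default their `seed` argument to the library-wide default seed, so passing
# LIBCUDF_DEFAULT_HASH_SEED explicitly, as this hunk now does, should be
# equivalent to the shorthand:
#     got = plc.hashing.xxhash_64(plc_scalar_input_tbl)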
assert_column_eq(got, expect) From 4e97cd44ef4838a20a641aee3eb1a0e59ec21491 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 7 Jan 2025 14:29:11 -0500 Subject: [PATCH 07/26] Fix the ORC decoding bug for the timestamp data (#17570) This PR introduces a band-aid class `run_cache_manager` to handle an exceptional case in TIMESTAMP data type, where the DATA stream (seconds) is processed ahead of SECONDARY stream (nanoseconds) and the excess rows are lost. The fix uses `run_cache_manager` (and also `cache_helper`, which is an implementation detail) to cache the potentially missed data from the DATA stream and let them be used in the next decoding iteration, thus preventing data loss. Closes #17155 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Matthew Murray (https://github.com/Matt711) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17570 --- cpp/src/io/orc/stripe_data.cu | 205 +++++++++++++++++- ...rcFile.timestamp.desynced.snappy.RLEv2.orc | Bin 0 -> 5832 bytes ....timestamp.desynced.uncompressed.RLEv2.orc | Bin 0 -> 5814 bytes python/cudf/cudf/tests/test_orc.py | 24 +- 4 files changed, 225 insertions(+), 4 deletions(-) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 1572b7246c0..1f84d1f81dc 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,6 +132,177 @@ struct orcdec_state_s { } vals; }; +/** + * @brief Manage caching of the first run of TIMESTAMP's DATA stream for a row group. + * + * This class is used to address a special case, where the first run of the DATA stream spans two + * adjacent row groups and its length is greater than the maximum length allowed to be consumed. + * This limit is imposed by the decoder when processing the SECONDARY stream. This class shall be + * instantiated in the shared memory, and be used to cache the DATA stream with a decoded data type + * of `int64_t`. As an optimization, the actual cache is implemented in the cache_helper class as a + * local variable and does not reside in the shared memory. + */ +class run_cache_manager { + private: + enum class status : uint8_t { + DISABLED, ///< Run cache manager is disabled. No caching will be performed. If the special case + ///< happens, the run cache manager will be set to this status after the cache read + ///< is completed. This status also applies when the special case does not happen. + CAN_WRITE_TO_CACHE, ///< Run cache manager is ready for write. If the special case happens, the + ///< run cache manager will be set to this status. + CAN_READ_FROM_CACHE, ///< Run cache manager is ready for read. If the special case happens, the + ///< run cache manager will be set to this status after the cache write is + ///< completed. + }; + + public: + /** + * @brief Initialize the run cache manager. + * + * @param[in] s ORC decoder state. + */ + __device__ void initialize(orcdec_state_s* s) + { + _status = (s->top.data.index.run_pos[CI_DATA2] > 0 and s->chunk.type_kind == TIMESTAMP) + ? 
status::CAN_WRITE_TO_CACHE + : status::DISABLED; + _reusable_length = 0; + _run_length = 0; + } + + private: + status _status; ///< The status of the run cache manager. + uint32_t + _reusable_length; ///< The number of data to be cached and reused later. For example, if a run + ///< has a length of 512 but the maximum length allowed to be consumed is + ///< capped at 162, then 350 (512-162) data will be cached. + uint32_t _run_length; ///< The length of the run, 512 in the above example. + friend class cache_helper; +}; + +/** + * @brief Helper class to help run_cache_manager cache the first run of TIMESTAMP's DATA stream for + * a row group. + * + * The run_cache_manager is intended to be stored in the shared memory, whereas the actual cache is + * in the local storage (as an optimization). If a function is to use run_cache_manager, both the + * manager and the cache objects need to be passed. This class is introduced to simplify the + * function call, so that only a single cache_helper object needs to be passed. To that end, public + * methods originally belonging to run_cache_manager have been moved to this class. + */ +class cache_helper { + public: + /** + * @brief Constructor. + * + * @param[in] run_cache_manager_inst An instance of run_cache_manager. + */ + __device__ explicit cache_helper(run_cache_manager& run_cache_manager_inst) + : _manager(run_cache_manager_inst) + { + } + + /** + * @brief Set the reusable length object. + * + * @param[in] run_length The length of the first run (spanning two adjacent row groups) of the + * DATA stream. + * @param[in] max_length The maximum length allowed to be consumed. This limit is imposed + * by the decoder when processing the SECONDARY stream. + */ + __device__ void set_reusable_length(uint32_t run_length, uint32_t max_length) + { + if (_manager._status == run_cache_manager::status::CAN_WRITE_TO_CACHE) { + _manager._run_length = run_length; + _manager._reusable_length = + (_manager._run_length > max_length) ? (_manager._run_length - max_length) : 0; + } + } + + /** + * @brief Adjust the maximum length allowed to be consumed when the length of the first run is + * greater than it. + * + * @param[in] max_length The maximum length allowed to be consumed for the DATA stream. + * @return A new maximum length. + */ + [[nodiscard]] __device__ uint32_t adjust_max_length(uint32_t max_length) + { + auto new_max_length{max_length}; + if (_manager._status == run_cache_manager::status::CAN_READ_FROM_CACHE) { + new_max_length -= _manager._reusable_length; + } + return new_max_length; + } + + /** + * @brief Copy the excess data from the intermediate buffer for the DATA stream to the cache. + * + * @param[in] src Intermediate buffer for the DATA stream. + */ + __device__ void write_to_cache(int64_t* src) + { + if (_manager._status != run_cache_manager::status::CAN_WRITE_TO_CACHE) { return; } + + auto const tid = threadIdx.x; + + __syncthreads(); + + // All threads in the block always take a uniform code path for the following branches. + // _reusable_length ranges between [0, 512]. 
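// [Editor's note] A worked instance of the arithmetic below, taken from the
// run_cache_manager docstring above rather than from new code: if the first
// run has _run_length = 512 and decoding is capped at max_length = 162, then
// set_reusable_length() stores _reusable_length = 512 - 162 = 350, and the
// branch below copies those trailing 350 values into per-thread _storage.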
+    if (_manager._reusable_length > 0) {
+      auto const length_to_skip = _manager._run_length - _manager._reusable_length;
+      if (tid < _manager._reusable_length) {
+        auto const src_idx = tid + length_to_skip;
+        _storage           = src[src_idx];
+      }
+      if (tid == 0) { _manager._status = run_cache_manager::status::CAN_READ_FROM_CACHE; }
+    } else {
+      if (tid == 0) { _manager._status = run_cache_manager::status::DISABLED; }
+    }
+
+    __syncthreads();
+  }
+
+  /**
+   * @brief Copy the cached data to the intermediate buffer for the DATA stream.
+   *
+   * @param[in,out] dst Intermediate buffer for the DATA stream.
+   * @param[in,out] rle Run length decoder state object.
+   */
+  __device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle)
+  {
+    if (_manager._status != run_cache_manager::status::CAN_READ_FROM_CACHE) { return; }
+
+    auto const tid = threadIdx.x;
+
+    // First, shift the data up
+    auto const dst_idx = tid + _manager._reusable_length;
+    auto const v       = (dst_idx < rle->num_vals + _manager._reusable_length) ? dst[tid] : 0;
+    __syncthreads();
+
+    if (dst_idx < rle->num_vals + _manager._reusable_length) { dst[dst_idx] = v; }
+    __syncthreads();
+
+    // Second, insert the cached data
+    if (tid < _manager._reusable_length) { dst[tid] = _storage; }
+    __syncthreads();
+
+    if (tid == 0) {
+      // Disable the run cache manager, since cache write-and-read happens at most once per row
+      // group.
+      _manager._status = run_cache_manager::status::DISABLED;
+      rle->num_vals += _manager._reusable_length;
+    }
+
+    __syncthreads();
+  }
+
+ private:
+  run_cache_manager& _manager;  ///< An instance of run_cache_manager.
+  int64_t _storage;             ///< Per-thread cache storage.
+};
+
 /**
  * @brief Initializes byte stream, modifying length and start position to keep the read pointer
  * 8-byte aligned.
@@ -631,6 +802,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = {
  * @param[in] maxvals maximum number of values to decode
  * @param[in] t thread id
  * @param[in] has_buffered_values If true, means there are already buffered values
+ * @param[in] cache_helper_inst If non-null, the run cache manager will be used to manage
+ * caching of the first run of the DATA stream.
  *
  * @return number of values decoded
  */
@@ -640,9 +813,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
                                          T* vals,
                                          uint32_t maxvals,
                                          int t,
-                                         bool has_buffered_values = false)
+                                         bool has_buffered_values        = false,
+                                         cache_helper* cache_helper_inst = nullptr)
 {
   if (t == 0) {
+    if (cache_helper_inst != nullptr) { maxvals = cache_helper_inst->adjust_max_length(maxvals); }
     uint32_t maxpos  = min(bs->len, bs->pos + (bytestream_buffer_size - 8u));
     uint32_t lastpos = bs->pos;
     auto numvals     = 0;
@@ -685,6 +860,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
           l += deltapos;
         }
       }
+
+      if (cache_helper_inst != nullptr) { cache_helper_inst->set_reusable_length(n, maxvals); }
+
       if ((numvals != 0) and (numvals + n > maxvals)) break;
       // case where there are buffered values and can't consume a whole chunk
       // from decoded values, so skip adding any more to buffer, work on buffered values and then
@@ -866,6 +1044,17 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
     __syncwarp();
   }
   __syncthreads();
+  // Currently run_cache_manager is only designed to fix the TIMESTAMP's DATA stream bug where the
+  // data type is int64_t.
+  if constexpr (cuda::std::is_same_v<T, int64_t>) {
+    if (cache_helper_inst != nullptr) {
+      // Run cache is read from during the 2nd iteration of the top-level while loop in
+      // gpuDecodeOrcColumnData().
+ cache_helper_inst->read_from_cache(vals, rle); + // Run cache is written to during the 1st iteration of the loop. + cache_helper_inst->write_to_cache(vals); + } + } return rle->num_vals; } @@ -1401,6 +1590,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) // Struct doesn't have any data in itself, so skip bool const is_valid = s->chunk.type_kind != STRUCT; size_t const max_num_rows = s->chunk.column_num_rows; + __shared__ run_cache_manager run_cache_manager_inst; + cache_helper cache_helper_inst(run_cache_manager_inst); if (t == 0 and is_valid) { // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { @@ -1443,6 +1634,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); + + run_cache_manager_inst.initialize(s); } __syncthreads(); @@ -1602,7 +1795,13 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (is_rlev1(s->chunk.encoding_kind)) { numvals = Integer_RLEv1(bs, &s->u.rlev1, s->vals.i64, numvals, t); } else { - numvals = Integer_RLEv2(bs, &s->u.rlev2, s->vals.i64, numvals, t); + numvals = Integer_RLEv2(bs, + &s->u.rlev2, + s->vals.i64, + numvals, + t, + false /**has_buffered_values */, + &cache_helper_inst); } if (s->chunk.type_kind == DECIMAL) { // If we're using an index, we may have to drop values from the initial run diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc new file mode 100644 index 0000000000000000000000000000000000000000..a0ea4fbbfc2fdc0fc9f7b1adbdf4ec6fef161de8 GIT binary patch literal 5832 zcmZ{oYjhjsoyPx>CCiKg1aV0iC^))nboEHG<&m!=$(BcwB~NI#6@h?&(hj(QFm@O& z2?7d?+omhnLz!?DblXnsBq#|9lN*Rz8CXIT3d!VJ5f*pA4JZM^gphTY5Bq7q{D04R z-{<+C_ni0JlcA$l07P%pwXB?<{XraU$->hB*#(FhX$IZASBw|WT~fwHD^H#LGvCo# zS+h~#JN*0hw9W;bNDC~w9^O3X$tL5ET8_3*7cDAoo!8pedg6`kYxoYwtNUC1FF)zX z!W961_p4`C&3^J~lV*;e=vrDx*N%fnNHZuyZVF5D{G*cmSuHDS&Hn3qc*Jk*~Eccj|UbEWg3xE0mN?Nd}g@{?GxFx3#GJ_U2Vc~`>JY^B|8^x4GN?YV% zi;}UZEg@vYiqckWiIup*O0{gt{lv=ro0a{kmHU~MKPx0$X%&}RrK_y+)mG)KP3ko^ zlMU_4EjIO5J93*HE!Z)` zW@5}v-C@s-+nEVFJHLaw)6TPY;V!#)w>3O}!(t&*XFuLnu?0yIFfP;F_ zk^8lSIeQEHu!DQV!E+8_(jlG`mVWDyf9FtEIn>`fk#k-~OHS-jC-In*`VVJr-rLOM zPWBH@?g=N)JB4#!7N2xVPdVkMoys##bpe4qhoNN*dmbZRz^Dac?msc+PZ;}WjQb15 zpGOF5FmVc#rZIUfrkp3L>s`o;F0|spHn<35gxciFz2ssxyVxx*&bXBqUBXtExXmSP zcgg2Rl$|d16*uy#8?Cyr^S2SNyQw$axi{U+TW(g9;@)=il3RGkExzlPF4!i&=T>&P z)%QKfZV!54H@3$^eBhz}=E;5NVJ_Ux?)7l{JiP1?_It#Ox}}32`H)9B>`{++k&AYq zbuad@mpJOBKJn%j_AsA%*%>eQ4==BHg@rrB&%M&DSN^A0Ip$Rt^&(&T(1s5??juet z4lml7``R1+#>bxYaerFHYjz0}tHkeo(rNRS?|sU}>$mh@{`+&+pa0dt|JJbg9lq6m z?D4JIFYdehl}vWwA?>#-pEz^Yv9njKIdhEx4&Cv}xOTl~*V1#ZnbhvZULBvme)RsS z%cI>bd#UR91?#6Dobp{eIsg4V(eX>y&ptHOn>Q@j%e_7p+U_R$V`7*@wM7 z)>#{Ubn2NK432%2G=4>8dS2JM8z$ZRn0LmPR%Rcc+8N#M-^aZ>e*Ff|6H~i>J{f%f zczXQi4Wp~44&P`9?~~pe>)tc{)YP#XC%g73yT2H+^pO zbWFx}kN;-l==0N;|I(1y-9(Q+wsHE8)4pF$retQ%SpNsJf12)P3|X1`VElzmo&}pS z%;Xia@VD`)O`~h3m&Lk&DoY=ZZ`?FJJ$=hflS|)U_|f>bO|$E!SKMs4W5{=rMro2J*r?zwqCvw!@+OS7A&cmB$7>u&SX z@uQnPFHi6K)#TWI;o#WNN26P(58q-~zF#^teq!^qanrF|Chy*_93KB}^X$&)GqLR} z4}^uAP{_W%RYRm)~Z1_yAQOOYfb2W3BJD$=@7cKAyOA%j~=z zy#>SX4{%2(EH8WBUYm)Xe(Zqo$%Omm(RbD^yWQ}I-O-;<1Ye&1ueG<_KKbMU<4yHURpIa`NBc$9}_8YbkEvn;@y8fNGTInh|_;tyY7z3sohI% znpi5%ezbPyxMBT4?(?x9@AvFmyK8)M<3V9|;^wWR7ru0O!m#C_^v{X0t&{(1`T8O3%dwvw82xzN<#!tXdWdRFJhpB6 
zqRqZLC*M899G`e{+w7<7dgI&QKg6AwcwxKeAL}ygVr~g8o;fNO>T`}faF{{1j)ysE&(d65zJNC+Vt;;T5_^$3m-|4<( z*DpNObh2-UzxNOPbuICw_dcP1I{Vbp6&HVZWb@%e|9#u{pPqhv=}Hu4bkAP)UO_W+ z_Om_rEjs-Oryh)BWuC515AepB^m9EAq8H5k__?d+Uv|;VRnOh~Yjn}f4bMIOYaTuF+TV9PgkCyx$8(1s z;xC)I=eh4e(>n8D`Lc(3)68$nJ&&N4BX8BOeuTHpJXOAzLme}JEI&Q}X>4Y?yko_c z->g`DYUhz*DO;)UFtCg^lR?13SbJo0-u@TyN#9B`a?&TLo*yDq5>n zsp%c7Y^_@rYs0GAG&aO$w4qIVY?uwV5jN6B*=SqNmbWoY`)#bPXya@p8*eMy1Y5-> zHhpZ9Y&Dx~tJ@S?!=~CbcBJVuJ8C!CF*|N2?4+Ht)An4`aXVu#*jang&e=l;xT$qkI931a1Y@jJ(PCRlk?;~jHlpX zJw*@aDS3G9qaMLi@ra(PNAlD>vZwA*w5vU;SK~#zMlb3$c`+~UCA80bDKG8KdGlVz zTkx{pqL3cY76Y!>jr6yp3oh z+o(3WEvG%+#=5-!3ebUEARk}?g#a5U1~{E7zz50!Ay5g3foebs)B>_D5Ksb*fEv^Uk)Sb% z22DXs7YPzUGDrpKU@n*sGQmQS)kT9`uoUEj<)9F(1jS%ADCv?xIam)W!A4LGX+lWI z7((GUAuNQ4h!7c~LUbq>%7+-;l_54%3~`}Sh!2%RLZ}iFb=QTYP%R{f>LDf62&wIw zc0_kmJKAn)$J+6B;+8YePC`2c?KHIKKzBQ|Gtgdub{5);(9S`73EFvRFGITky1SrV zg!U@5OVD0}b{X31(5^sx1KL&4-47iIbQqxng$@&RV92LT-fE%tN>gVFAJw2#XM|LRf-u4Z)FGlkqyZ5Xnwy{tfi5F-q0nW5E)2SG=pvwtgf0r24bYW?t~_)x&{cpg7P^Yi z#X(mIx_D?lAG!qSsz8?rT~+9kpsNO5GIZ6UOM&J~pi6~r4Rj;WZG>+0pRb>UZVbBd zW2P<8Y=&+Mx@qXnL3bXynPaDyLpKZE#V@W5L$eFI`7gH2`32~%K(`3pRp^$WyY}Uy zbJ+lND_?#&=U1Ue)5wIO#|S+r^q3m&&SfLeLmYRWgdPfd=;IG?Ux%w;RkBYyRxIllzWN$8QGrw%;|^fbOYxdnPP(2IQiljYEhLi3WZ zH-@1Xhh74DN$91ZmxkWlH@`axy$m#8`OW8Z{vz~pC#g9<54~mR6;8f2r$uPK?ptdZ zdTY=te_K2Wy$bX;pjU-H4fG+l%Nm!O{qn1X%*`YX^cLVp$d zCFrj~zYP6#=%4%F)zGg3sR0rJ(g-99qzOn2NE}E4nx6-f0+I$Y2V@>d2FL=CERaPY zIcQ!3BoAa6NCC(SkRp&(ASEDcK+4d(0Z0YN29PR5H4sH0YJ?~XQ4>TlXx<7@0-_{D zDTvY#%|SE|Q3j#~h_cZ98bmpWmLSSQv6^Mxd z8zCk^tOhX|Vs(fq5Nkk8g}4Ud2sD2NaTMYvh+`1PAx=P?gg6Cp8sa%bLwGJ}OdcCa|e>0N{TV0lm&tPF~S)j?^nHYmfkL1nNp zs3tTCBw7QYH$caEhQvilS&Lr@x(Ir~<`OMT(KOmX@(DLz$B38_j- zOjT1-s+N-V8&XQDky6u|G?F%^(X=Uz>9?kdG?}K-bUK&Lr2!n?!RFg{ESlf%?7J)9fP4>S7x!|ZTzm>Vt) z^TXv~VYo6Z>OUTqhHJy}aD7-AZVam#O$O0_mO(S743@z&M25^z89I~GAI~tELWa#0 zGhC*W;WOoopg)xnGu4cgsb%C$J)>kA8MUP;i)4*iG;7LYSv*T*$t=}k$mX*7ER!u{ z*=#Y(WlLGU<@~IWtz^Y)H7jLnSvgzJDlL~})e+4IGGZJ-M@%Ew2tGo<2P4!7J(3&A zk1!*J5q6|F!nL?Y_>uC6Fj5&2N2(*zNNq%J35+NsjS-dB&lW zX*uf$@xy-N|KH!Ya(-f7UWa@A-ayOE3p9&XuGE?|FyCSKHy2MdHD&1N!v6ue ClTzIP literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc new file mode 100644 index 0000000000000000000000000000000000000000..8a7969cdbbb380dc92c2f5fc72642a990fb204f0 GIT binary patch literal 5814 zcmZ{oYj_*gnaAJAl4V8#f(RrGlsLL;boEHG<&m!=$(BcwWlw0g6@h?&whXv{Fm@O& z2_h62w@p{D4`sqt&}}=hlb|FdOl}};-*cYle0wu=6ww=XO)KXP!N0`urdhEV(ZcL7UN*Z=85gcRbNY{bTXT8sCV_AB zAK2GCYq2AZ3vYbuf~OjcKWIAML|wYDuX}bo zYtYQ{lYeo|>cii!Kl<>0`QAVI>f0Z^^-n%=?!1}VtC2a)sHPcfXePAH6l};gHZ%HW zwyBvrubH3Y7Yxnf+-B+gX8D3juXf<-Mv%tFO2S$&Y{x3CEdH(=o@i=f{mrYut0A`e=Wj74n;~RR_5QV?2oP7PptfTA>nGPxYR0LW0kM9D(7uhud^Z7+t8d1yTL}x>7#D4Wq)d8 zer98TZsU-bdB!H(ViRw*Nh3Dd(5l>KQ*XB;ci7Rq9W!hpM(xyH_UxFQ8Mm`@+qk>! 
zJZl&3v5WUw!*e%oz2B~^u&XN_$fu8>dmh0abP%f?)I*N!FCEPJTiHh(+@lVja|ja- z@q)1QYlr+BhqBtC{?>_H@CsUVVvjkA$DP!FIJ0x!VV-cZzjJa=I(gnHT=;>o^`782;_MTEn(OT81W)T%@ec#i7|h`*gs<2pD_L+LRgE5lbAGx$?GuXB2nGo zLSAyAWf!*5MHnO0W>@xQ7qi91Zgp|SZM^6bwzX~X8_dyu^z^pZ|&pNIIsL;b~* z{m{c)vV-04;SP9s*&`hEh?jOshduHUk8;$b9`hoX?nG-|>|-x++)I7p%`WI-KJ~KG zUhZ#RUhxVGc8Z^Sr5UgMcdv57t1j$DzVxAWA9m75oLU@SxGVd$H~fu{J?-QEu$tHG z5yn@G-}lqLJlUNy%-hesF&f(Id2}+9o0z{}cyrAC(#ZLnmfc`jxL`ey80ZJOXsysEP zWBpAN?gPy3v8CmiCnk4AclZx*?~dKL(evcwo}W$x-#?iiyLIEpn#rR-GlUOF?~Qiu zn|gZk#Lp%=4k&xZ?%g3TEyJceO`wKoA+rD{b{nUzE4c8q+_K(It^t?2+`qqgX4`K(#-hFxGqRr2a z7=C_`lE*%Hd1~|2`q+K99%K%V9eR0Y%hax47;fKdUOIMsi|3W8J-?V3Jt!O=9r$Qu z+tkt949gEnN5)QVnKEuZaofbb2bH5^-)@=NHFYkwW91>_*tmA9=e2bg-afHvZ{Ygz zxm!nGU$^KE!y|{N+Gu+J)SK&kcTD{15cBc)6JK5lu%^Uk_V?Cj%* zgiprZuZ-+ox9m>C@AgK2Iv#vw>c7_AcIU)Xhm=o8hY!r`S+`=;@Z4c!dORk2=D)mp zbmGOs*x$xe;>f;r&&E6dc$iYguM(&JvTprd6O((F+%mpYocU0?umDgFek^K+CKB?`tJCS z_m6O=#$Vjw`P=#oJMqEZJMS8w+%fXm`epYR_8*bH9=+!9)Xe(Z?wL4rMEPcX`;M6f zTUOj_ICc~{J^sc{&zI|0$Il)=iv45!-JK&R*FPKkXnOC1?Dz*e*M7Br{qm1KKgygL zy?+1NZ`SX+-|&Te*SF)xcX>cdeK&FHqr1+If4*zvj3B+$dHSgI-RR7Usk48g&T+&` zkF6N>teDZ>_u8UeUo`l(=}x@*ZS%4#7QCxF)pNFI*^LX1G@S0)>F@p>e?wDz>HSZt zpUynJbj4-g9@}#C$baAQ-KS@tSh^C0Y29;IzL(cbpZ{Ff0}Ido`I6_ZeSmK~w(Z#c z1+;0}`rI=G9+~z$x6^QB?sVw6BM@I@o2GwV z>UtEl9DBQV?W4SH`svdB9O{_8EQ}>@VJ!s<*Ra;YTS^weQnrYeibb+iEpo#~i(;u;RIA2{SdCWHYO-Pt z+pL6@v{F{unziPvj5Tj%;YKTGEn0bN$tqaOR?%9qN)5ZMvbAPataYnu)7TK3(S|nc zvtc&eM%YLjWut9bTh7Kb9JH~vf{n8kZM>~y6KrLh*zmDUvQ=%et!7hfb(?C}*pY_M z?5N#j$LzSBu#@TWyQ*z) zAP%Dgb(kEO19uP((m`nrj;tf+U>tb|>nJ!lN72EQO@x^gZ?d$o&o6P5RuBuCR)m)0J z?o!cbD9P_8zzBuDB(4)h)Yg zZpB@9tJ()Wh{xzbJthz4!99eB^ibLfPu7$3FrK`J^%Ojur|99ek9h=7*&})?9?4Vn z$ex-<(XR2RUX2&=8oj93& z@>abv-0M}mb+78v_z<7bhx$xDOuNlT_(&h+qkUOl&d2!jK34m>kMk9MyszXFd}W{L ztN0}CZlCO{`4nH>r}{O1#BcPY+I@b^kNXKf>8Jd(KkLu=8SOzo>o52@f6>qTOMby$ z_KVt&{gS`xm;E)r;;;MF7EKGH{j3FTF|}YVcni@&wool}OICZbg=xvRuq}lauBF(* zx0G50?U@#_rP3m`R9obhT8q+BZ&6_tj^IWd#Z5Sd<2ZqnIHfb-Sv-d`cphi*0?y$@ zoY!593wRk9@d_^CRb0kvxT3opR|A>=5-!3ebUUAQxZ)`2ZUz z1UQ{5zz0eJAy5v8fl5FMR0Fau5Ksd3fEv^Uk)Sb%22DXs7YPzUGDrpKU^bWwGQoV1 z)kT9`uo&cnrJxWj2gP6|DCv?xIamuS!Fo^)X+lWI7((G!AuNQ4h!7c~LUbq_%7qx+ z)gd-i2yvleh!2%QLZ}=PbvJ~hP&FinY9S?552>x1Rz!D8E81#m#ai)J;e; zJqT?Gv>Bldg*FqkVbF#{8v$)3v{9g&fVM2O<)Dp$wmh`4&{lvp4%&*)#)IxLXcM5V z3~eH`RiI6RwkovA&{l&s1$1kmO@*)q!U%+o5Jn+vf-nYQ9Kr;^;}E7GOhY&e;T(h+ z2fF%tN>YVFAKr2#XM|Kv;rs6~Z#;HbPi|a2>)bv}>Rpfp#ObqtI@G zb_{gepq+qr651(fr=dLy?Kx;?pgj-mEa+Z`b`IK$(9T193EBl{FGITs?GrWKtzH_6(TZ3Y7kK%Qiq5NjSbL&K!*`J zQ0Op02L>HDbP&)%LI(wn2I$B_M-Dm|=*U9{3mpaM;Gm-j9XvE%3>^Y=l%Yd}jtX>0 z&{2gB89Hjvp+Mv1(4j)7209VwG(so(_cu;MCkCDP3DZ_+G(#r^oiuc2p)&`a%!#wh zp_7Hq!WUPEq0t4M{1;ni{Q`8Bp;Ls;3Uo@)S^e^{*=zthl`p@X^{ddOsb|8_WrQvi zx=i(VXR{ILB2GF_Ll*^I^vQ>}LRSvD80gAF7YlF+baAJ0%b|;huF|PjXR~GK62JPv ztY3o0By`EpRf8@Cy6Rt@-U{6s=tjQ&(Q@cUp>fIAo5IkILpK54By>~IO+$C~o8O#< zZU!2!{^s*pe*wC=)6}e=hwc(|3#Z?n)gmyc~M7 z(3?AVe;9i6(91$^0eU&;EkZ93FbTZ^^p>Glgx(7DO3+({UKx68&^!CTYoJ#JQUfFc zq!CCINE474kT{S8G`;{N1tbk*7RVfs43K#sSs)8Qa?rRINFK-%kOGipAVnZ6KuSPX zfs~TxF&tt9#7Kxy z5ThZMg;)+^3^X2u7z?oi#5jl*A;v?j1Tg_(Wr&FYn;<4ZtO_w1Vl{{<5UWE>g}4Ud z2sC~MaTMYvh+`1PAx=P?gg6Cp8sb@KJPC0I;(3U(5HCQSgLo0*Jj6>77ohPB#6^f# zATB|?3UL|YHHa$^uR~k~eFO9%&}W1`6#7iihe00>eZ;agePkb{H}qxua(zr+zK`uI z^l^Q~K3;!upU_wC6Z0SMNf2m*SFZYZ6m42ze+AqWQex<+OuO>7JBw_BdS(O*5l4ipBsf#Lu^P#O>h$^)YQh5>1yIv@|!29$yNfJ$j7M1Ko~QYH$c zaEhQvilS&LtG|JmvXJDG#U!tPEGZ<*NikVTO37+cPS%o&eoazMX;Mham_k#g z6qdqMg#LvTm7-JGR4&D&@+me|NOAhLDLz$738`{QOjS}+s+yAZ8&gWEo>J4AG?F%^ z(X=Uz>9?hcG?}K-bUK^PrI~a-&FWuIbLnE5PnXg{x||l%m9(VaotD$Jw34o;)j`c5 
zGH4t`;oU)O5FaE4$w6w69?TBr1{wXqL3Xe($PE?;`N7hlFjyWG^&byPgVjNKur{a+ z)(6#$CWGid%b*!k2Fu_XB12}V44uj9PiB}*KEq}T87@=I@R?FZ(4WbOnMy{=R5Nm> zmQgbGjM~&NgbW#n&>_JFwv%`g9uE{mb50{38 z;qtIJTp5;ztHW|rU|1Qh5396>Mrb3A(k2>fiqHg2(iBb8Svp5Ebe?9LqBKVrX`U|8 z0$rv>x8~;dB1U^b7yhw!tbzV+;M!q z>9VG|#yNA)xg&Qk8@+ktraMQkAGy659N-62Q`33hEC2EN#P`LC|G#+8%DIU-IUVlx ddjm~3&pv+TO07u)a~&3cW8qXoLxzqn_#X}@SVRB- literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index c4b4ef60184..fe143e66407 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import datetime import decimal @@ -1970,3 +1970,25 @@ def test_row_group_alignment(datadir): got = cudf.read_orc(buffer) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "inputfile", + [ + "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc", + "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc", + ], +) +def test_orc_reader_desynced_timestamp(datadir, inputfile): + # Test a special case where the DATA stream (second) in a TIMESTAMP column + # is progressed faster than the SECONDARY stream (nanosecond) at the start of a row + # group. In this case, the "run cache manager" in the decoder kernel is used to + # orchestrate the dual-stream processing. + # For more information, see https://github.com/rapidsai/cudf/issues/17155. + + path = datadir / inputfile + + expect = pd.read_orc(path) + got = cudf.read_orc(path) + + assert_frame_equal(cudf.from_pandas(expect), got) From 30c6caa7a5dc5bb18dcba04c87a03a5343f78fd2 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 7 Jan 2025 14:15:02 -0600 Subject: [PATCH 08/26] Remove "legacy" Dask DataFrame support from Dask cuDF (#17558) The legacy Dask DataFrame API is deprecated. We should remove it for 25.02 to reduce maintenance burden. 
**Blockers**: - [x] https://github.com/rapidsai/dask-cuda/pull/1417 Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - James Lamb (https://github.com/jameslamb) - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17558 --- ci/test_python_other.sh | 13 +- ci/test_wheel_dask_cudf.sh | 16 +- python/dask_cudf/dask_cudf/__init__.py | 56 +- .../dask_cudf/dask_cudf/_expr/collection.py | 36 +- python/dask_cudf/dask_cudf/_expr/expr.py | 8 +- python/dask_cudf/dask_cudf/_expr/groupby.py | 266 ++++- python/dask_cudf/dask_cudf/_legacy/core.py | 711 -------------- python/dask_cudf/dask_cudf/_legacy/groupby.py | 909 ------------------ .../dask_cudf/_legacy/io/__init__.py | 12 +- python/dask_cudf/dask_cudf/_legacy/io/csv.py | 222 ----- python/dask_cudf/dask_cudf/_legacy/io/json.py | 209 ---- python/dask_cudf/dask_cudf/_legacy/io/orc.py | 195 ---- .../dask_cudf/dask_cudf/_legacy/io/parquet.py | 64 +- python/dask_cudf/dask_cudf/_legacy/io/text.py | 56 -- python/dask_cudf/dask_cudf/_legacy/sorting.py | 361 ------- python/dask_cudf/dask_cudf/backends.py | 149 +-- python/dask_cudf/dask_cudf/core.py | 61 +- python/dask_cudf/dask_cudf/io/__init__.py | 15 +- python/dask_cudf/dask_cudf/io/csv.py | 12 +- python/dask_cudf/dask_cudf/io/json.py | 213 +++- python/dask_cudf/dask_cudf/io/orc.py | 208 +++- python/dask_cudf/dask_cudf/io/parquet.py | 16 +- .../dask_cudf/dask_cudf/io/tests/test_json.py | 10 +- .../dask_cudf/dask_cudf/io/tests/test_orc.py | 10 +- .../dask_cudf/io/tests/test_parquet.py | 39 +- .../dask_cudf/dask_cudf/io/tests/test_s3.py | 5 +- .../dask_cudf/dask_cudf/io/tests/test_text.py | 10 +- python/dask_cudf/dask_cudf/io/text.py | 60 +- .../dask_cudf/tests/test_accessor.py | 4 +- python/dask_cudf/dask_cudf/tests/test_core.py | 104 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 69 +- .../dask_cudf/dask_cudf/tests/test_onehot.py | 8 +- python/dask_cudf/dask_cudf/tests/test_sort.py | 4 +- python/dask_cudf/dask_cudf/tests/utils.py | 35 +- python/dask_cudf/pyproject.toml | 9 +- 35 files changed, 864 insertions(+), 3311 deletions(-) delete mode 100644 python/dask_cudf/dask_cudf/_legacy/core.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/groupby.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/io/csv.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/io/json.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/io/orc.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/io/text.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/sorting.py diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index db86721755d..3c6dba72164 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
# Support invoking test_python_cudf.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ @@ -24,8 +24,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -rapids-logger "pytest dask_cudf (dask-expr)" -DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ +rapids-logger "pytest dask_cudf" +./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ @@ -34,13 +34,6 @@ DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ --cov-report=term -rapids-logger "pytest dask_cudf (legacy)" -DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . - rapids-logger "pytest cudf_kafka" ./ci/run_cudf_kafka_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index e15949f4bdb..44f430ce98d 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -eou pipefail @@ -30,21 +30,11 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" # Run tests in dask_cudf/tests and dask_cudf/io/tests -rapids-logger "pytest dask_cudf (dask-expr)" +rapids-logger "pytest dask_cudf" pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ +python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ . popd - -# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy) -rapids-logger "pytest dask_cudf (legacy)" -pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . -popd diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 20eb2404b77..863102103ed 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -1,7 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import warnings -from importlib import import_module +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import dask.dataframe as dd from dask import config @@ -9,11 +6,16 @@ import cudf -from . import backends # noqa: F401 +from . import backends, io # noqa: F401 +from ._expr.expr import _patch_dask_expr from ._version import __git_commit__, __version__ # noqa: F401 -from .core import DataFrame, Index, Series, concat, from_cudf +from .core import DataFrame, Index, Series, _deprecated_api, concat, from_cudf -QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED +if not (QUERY_PLANNING_ON := dd._dask_expr_enabled()): + raise ValueError( + "The legacy DataFrame API is not supported in dask_cudf>24.12. " + "Please enable query-planning, or downgrade to dask_cudf<=24.12" + ) def read_csv(*args, **kwargs): @@ -36,46 +38,18 @@ def read_parquet(*args, **kwargs): return dd.read_parquet(*args, **kwargs) -def _deprecated_api(old_api, new_api=None, rec=None): - def inner_func(*args, **kwargs): - if new_api: - # Use alternative - msg = f"{old_api} is now deprecated. " - msg += rec or f"Please use {new_api} instead." 
- warnings.warn(msg, FutureWarning) - new_attr = new_api.split(".") - module = import_module(".".join(new_attr[:-1])) - return getattr(module, new_attr[-1])(*args, **kwargs) - - # No alternative - raise an error - raise NotImplementedError( - f"{old_api} is no longer supported. " + (rec or "") - ) - - return inner_func - - -if QUERY_PLANNING_ON: - from . import io - from ._expr.expr import _patch_dask_expr - - groupby_agg = _deprecated_api("dask_cudf.groupby_agg") - read_text = DataFrame.read_text - _patch_dask_expr() - -else: - from . import io # noqa: F401 - from ._legacy.groupby import groupby_agg # noqa: F401 - from ._legacy.io import read_text # noqa: F401 - - +groupby_agg = _deprecated_api("dask_cudf.groupby_agg") +read_text = DataFrame.read_text to_orc = _deprecated_api( "dask_cudf.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.to_orc", rec="Please use DataFrame.to_orc instead.", ) +_patch_dask_expr() + + __all__ = [ "DataFrame", "Index", diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 5192e6b8171..e8c9a970b7b 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import warnings from functools import cached_property @@ -15,19 +15,11 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.dataframe.dispatch import get_parallel_type from dask.typing import no_default import cudf -_LEGACY_WORKAROUND = ( - "To enable the 'legacy' dask-cudf API, set the " - "global 'dataframe.query-planning' config to " - "`False` before dask is imported. This can also " - "be done by setting an environment variable: " - "`DASK_DATAFRAME__QUERY_PLANNING=False` " -) - - ## ## Custom collection classes ## @@ -103,9 +95,8 @@ def set_index( divisions = None warnings.warn( "Ignoring divisions='quantile'. This option is now " - "deprecated. Please use the legacy API and raise an " - "issue on github if this feature is necessary." - f"\n{_LEGACY_WORKAROUND}", + "deprecated. Please raise an issue on github if this " + "feature is necessary.", FutureWarning, ) @@ -135,9 +126,7 @@ def groupby( if kwargs.pop("as_index") is not True: raise NotImplementedError( - f"{msg} Please reset the index after aggregating, or " - "use the legacy API if `as_index=False` is required.\n" - f"{_LEGACY_WORKAROUND}" + f"{msg} Please reset the index after aggregating." ) else: warnings.warn(msg, FutureWarning) @@ -153,15 +142,15 @@ def groupby( ) def to_orc(self, *args, **kwargs): - from dask_cudf._legacy.io import to_orc + from dask_cudf.io.orc import to_orc as to_orc_impl - return to_orc(self, *args, **kwargs) + return to_orc_impl(self, *args, **kwargs) @staticmethod def read_text(*args, **kwargs): - from dask_cudf._legacy.io.text import read_text as legacy_read_text + from dask_cudf.io.text import read_text as read_text_impl - return legacy_read_text(*args, **kwargs) + return read_text_impl(*args, **kwargs) def clip(self, lower=None, upper=None, axis=1): if axis not in (None, 1): @@ -197,6 +186,13 @@ class Index(DXIndex, CudfFrameBase): pass # Same as pandas (for now) +# dask.dataframe dispatch +get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) +get_parallel_type.register(cudf.Series, lambda _: Series) +get_parallel_type.register(cudf.BaseIndex, lambda _: Index) + + +# dask_expr dispatch (might go away?) 
get_collection_type.register(cudf.DataFrame, lambda _: DataFrame) get_collection_type.register(cudf.Series, lambda _: Series) get_collection_type.register(cudf.BaseIndex, lambda _: Index) diff --git a/python/dask_cudf/dask_cudf/_expr/expr.py b/python/dask_cudf/dask_cudf/_expr/expr.py index 8b91e53604c..03d1da0d258 100644 --- a/python/dask_cudf/dask_cudf/_expr/expr.py +++ b/python/dask_cudf/dask_cudf/_expr/expr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import functools import dask_expr._shuffle as _shuffle_module @@ -7,13 +7,13 @@ from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var -from dask.dataframe.core import ( - is_dataframe_like, +from dask.dataframe.dispatch import ( + is_categorical_dtype, make_meta, meta_nonempty, ) -from dask.dataframe.dispatch import is_categorical_dtype from dask.typing import no_default +from dask.utils import is_dataframe_like import cudf diff --git a/python/dask_cudf/dask_cudf/_expr/groupby.py b/python/dask_cudf/dask_cudf/_expr/groupby.py index 0242fac6e72..a5cdd43169b 100644 --- a/python/dask_cudf/dask_cudf/_expr/groupby.py +++ b/python/dask_cudf/dask_cudf/_expr/groupby.py @@ -1,6 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import functools +import numpy as np import pandas as pd from dask_expr._collection import new_collection from dask_expr._groupby import ( @@ -16,11 +17,262 @@ from dask.dataframe.groupby import Aggregation from cudf.core.groupby.groupby import _deprecate_collect +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking ## ## Fused groupby aggregations ## +OPTIMIZED_AGGS = ( + "count", + "mean", + "std", + "var", + "sum", + "min", + "max", + list, + "first", + "last", +) + + +def _make_name(col_name, sep="_"): + """Combine elements of `col_name` into a single string, or no-op if + `col_name` is already a string + """ + if isinstance(col_name, str): + return col_name + return sep.join(name for name in col_name if name != "") + + +@_dask_cudf_performance_tracking +def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): + """Initial partition-level aggregation task. + + This is the first operation to be executed on each input + partition in `groupby_agg`. Depending on `aggs`, four possible + groupby aggregations ("count", "sum", "min", and "max") are + performed. The result is then partitioned (by hashing `gb_cols`) + into a number of distinct dictionary elements. The number of + elements in the output dictionary (`split_out`) corresponds to + the number of partitions in the final output of `groupby_agg`. 
+ """ + + # Modify dict for initial (partition-wise) aggregations + _agg_dict = {} + for col, agg_list in aggs.items(): + _agg_dict[col] = set() + for agg in agg_list: + if agg in ("mean", "std", "var"): + _agg_dict[col].add("count") + _agg_dict[col].add("sum") + else: + _agg_dict[col].add(agg) + _agg_dict[col] = list(_agg_dict[col]) + if set(agg_list).intersection({"std", "var"}): + pow2_name = _make_name((col, "pow2"), sep=sep) + df[pow2_name] = df[col].astype("float64").pow(2) + _agg_dict[pow2_name] = ["sum"] + + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + _agg_dict + ) + output_columns = [_make_name(name, sep=sep) for name in gb.columns] + gb.columns = output_columns + # Return with deterministic column ordering + return gb[sorted(output_columns)] + + +@_dask_cudf_performance_tracking +def _tree_node_agg(df, gb_cols, dropna, sort, sep): + """Node in groupby-aggregation reduction tree. + + The input DataFrame (`df`) corresponds to the + concatenated output of one or more `_groupby_partition_agg` + tasks. In this function, "sum", "min" and/or "max" groupby + aggregations will be used to combine the statistics for + duplicate keys. + """ + + agg_dict = {} + for col in df.columns: + if col in gb_cols: + continue + agg = col.split(sep)[-1] + if agg in ("count", "sum"): + agg_dict[col] = ["sum"] + elif agg == "list": + agg_dict[col] = [list] + elif agg in OPTIMIZED_AGGS: + agg_dict[col] = [agg] + else: + raise ValueError(f"Unexpected aggregation: {agg}") + + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + agg_dict + ) + + # Don't include the last aggregation in the column names + output_columns = [ + _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) + for name in gb.columns + ] + gb.columns = output_columns + # Return with deterministic column ordering + return gb[sorted(output_columns)] + + +@_dask_cudf_performance_tracking +def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): + """Calculate variance (given count, sum, and sum-squared columns).""" + + # Select count, sum, and sum-squared + n = df[count_name] + x = df[sum_name] + x2 = df[pow2_sum_name] + + # Use sum-squared approach to get variance + var = x2 - x**2 / n + div = n - ddof + div[div < 1] = 1 # Avoid division by 0 + var /= div + + # Set appropriate NaN elements + # (since we avoided 0-division) + var[(n - ddof) == 0] = np.nan + + return var + + +@_dask_cudf_performance_tracking +def _finalize_gb_agg( + gb_in, + gb_cols, + aggs, + columns, + final_columns, + as_index, + dropna, + sort, + sep, + str_cols_out, + aggs_renames, +): + """Final aggregation task. + + This is the final operation on each output partitions + of the `groupby_agg` algorithm. This function must + take care of higher-order aggregations, like "mean", + "std" and "var". We also need to deal with the column + index, the row index, and final sorting behavior. 
+ """ + + gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) + + # Deal with higher-order aggregations + for col in columns: + agg_list = aggs.get(col, []) + agg_set = set(agg_list) + if agg_set.intersection({"mean", "std", "var"}): + count_name = _make_name((col, "count"), sep=sep) + sum_name = _make_name((col, "sum"), sep=sep) + if agg_set.intersection({"std", "var"}): + pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) + var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) + if "var" in agg_list: + name_var = _make_name((col, "var"), sep=sep) + gb[name_var] = var + if "std" in agg_list: + name_std = _make_name((col, "std"), sep=sep) + gb[name_std] = np.sqrt(var) + gb.drop(columns=[pow2_sum_name], inplace=True) + if "mean" in agg_list: + mean_name = _make_name((col, "mean"), sep=sep) + gb[mean_name] = gb[sum_name] / gb[count_name] + if "sum" not in agg_list: + gb.drop(columns=[sum_name], inplace=True) + if "count" not in agg_list: + gb.drop(columns=[count_name], inplace=True) + if list in agg_list: + collect_name = _make_name((col, "list"), sep=sep) + gb[collect_name] = gb[collect_name].list.concat() + + # Ensure sorted keys if `sort=True` + if sort: + gb = gb.sort_values(gb_cols) + + # Set index if necessary + if as_index: + gb.set_index(gb_cols, inplace=True) + + # Unflatten column names + col_array = [] + agg_array = [] + for col in gb.columns: + if col in gb_cols: + col_array.append(col) + agg_array.append("") + else: + name, agg = col.split(sep) + col_array.append(name) + agg_array.append(aggs_renames.get((name, agg), agg)) + if str_cols_out: + gb.columns = col_array + else: + gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) + + return gb[final_columns] + + +@_dask_cudf_performance_tracking +def _redirect_aggs(arg): + """Redirect aggregations to their corresponding name in cuDF""" + redirects = { + sum: "sum", + max: "max", + min: "min", + "collect": list, + "list": list, + } + if isinstance(arg, dict): + new_arg = dict() + for col in arg: + if isinstance(arg[col], list): + new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] + elif isinstance(arg[col], dict): + new_arg[col] = { + k: redirects.get(v, v) for k, v in arg[col].items() + } + else: + new_arg[col] = redirects.get(arg[col], arg[col]) + return new_arg + if isinstance(arg, list): + return [redirects.get(agg, agg) for agg in arg] + return redirects.get(arg, arg) + + +@_dask_cudf_performance_tracking +def _aggs_optimized(arg, supported: set): + """Check that aggregations in `arg` are a subset of `supported`""" + if isinstance(arg, (list, dict)): + if isinstance(arg, dict): + _global_set: set[str] = set() + for col in arg: + if isinstance(arg[col], list): + _global_set = _global_set.union(set(arg[col])) + elif isinstance(arg[col], dict): + _global_set = _global_set.union(set(arg[col].values())) + else: + _global_set.add(arg[col]) + else: + _global_set = set(arg) + + return bool(_global_set.issubset(supported)) + elif isinstance(arg, (str, type)): + return arg in supported + return False + def _get_spec_info(gb): if isinstance(gb.arg, (dict, list)): @@ -105,20 +357,14 @@ def shuffle_by_index(self): @classmethod def chunk(cls, df, *by, **kwargs): - from dask_cudf._legacy.groupby import _groupby_partition_agg - return _groupby_partition_agg(df, **kwargs) @classmethod def combine(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby import _tree_node_agg - return _tree_node_agg(_concat(inputs), **kwargs) @classmethod def aggregate(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby 
import _finalize_gb_agg - return _finalize_gb_agg(_concat(inputs), **kwargs) @property @@ -193,12 +439,6 @@ def _maybe_get_custom_expr( shuffle_method=None, **kwargs, ): - from dask_cudf._legacy.groupby import ( - OPTIMIZED_AGGS, - _aggs_optimized, - _redirect_aggs, - ) - if kwargs: # Unsupported key-word arguments return None diff --git a/python/dask_cudf/dask_cudf/_legacy/core.py b/python/dask_cudf/dask_cudf/_legacy/core.py deleted file mode 100644 index d6beb775a5e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/core.py +++ /dev/null @@ -1,711 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import math -import warnings - -import numpy as np -import pandas as pd -from tlz import partition_all - -from dask import dataframe as dd -from dask.base import normalize_token, tokenize -from dask.dataframe.core import ( - Scalar, - handle_out, - make_meta as dask_make_meta, - map_partitions, -) -from dask.dataframe.utils import raise_on_meta_error -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname - -import cudf -from cudf import _lib as libcudf -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._expr.accessors import ListMethods, StructMethods -from dask_cudf._legacy import sorting -from dask_cudf._legacy.sorting import ( - _deprecate_shuffle_kwarg, - _get_shuffle_method, -) - - -class _Frame(dd.core._Frame, OperatorMethodMixin): - """Superclass for DataFrame and Series - - Parameters - ---------- - dsk : dict - The dask graph to compute this DataFrame - name : str - The key prefix that specifies which keys in the dask comprise this - particular DataFrame / Series - meta : cudf.DataFrame, cudf.Series, or cudf.Index - An empty cudf object with names, dtypes, and indices matching the - expected output. - divisions : tuple of index values - Values along which we partition our blocks on the index - """ - - def _is_partition_type(self, meta): - return isinstance(meta, self._partition_type) - - def __repr__(self): - s = "" - return s % (type(self).__name__, len(self.dask), self.npartitions) - - -normalize_token.register(_Frame, lambda a: a._name) - - -class DataFrame(_Frame, dd.core.DataFrame): - """ - A distributed Dask DataFrame where the backing dataframe is a - :class:`cuDF DataFrame `. - - Typically you would not construct this object directly, but rather - use one of Dask-cuDF's IO routines. - - Most operations on :doc:`Dask DataFrames ` are - supported, with many of the same caveats. 
- - """ - - _partition_type = cudf.DataFrame - - @_dask_cudf_performance_tracking - def _assign_column(self, k, v): - def assigner(df, k, v): - out = df.copy() - out[k] = v - return out - - meta = assigner(self._meta, k, dask_make_meta(v)) - return self.map_partitions(assigner, k, v, meta=meta) - - @_dask_cudf_performance_tracking - def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): - import uuid - - if kwargs is None: - kwargs = {} - - if cache_key is None: - cache_key = uuid.uuid4() - - def do_apply_rows(df, func, incols, outcols, kwargs): - return df.apply_rows( - func, incols, outcols, kwargs, cache_key=cache_key - ) - - meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) - return self.map_partitions( - do_apply_rows, func, incols, outcols, kwargs, meta=meta - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def merge(self, other, shuffle_method=None, **kwargs): - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().merge( - other, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def join(self, other, shuffle_method=None, **kwargs): - # CuDF doesn't support "right" join yet - how = kwargs.pop("how", "left") - if how == "right": - return other.join(other=self, how="left", **kwargs) - - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().join( - other, - how=how, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def set_index( - self, - other, - sorted=False, - divisions=None, - shuffle_method=None, - **kwargs, - ): - pre_sorted = sorted - del sorted - - if divisions == "quantile": - warnings.warn( - "Using divisions='quantile' is now deprecated. 
" - "Please raise an issue on github if you believe " - "this feature is necessary.", - FutureWarning, - ) - - if ( - divisions == "quantile" - or isinstance(divisions, (cudf.DataFrame, cudf.Series)) - or ( - isinstance(other, str) - and cudf.api.types.is_string_dtype(self[other].dtype) - ) - ): - # Let upstream-dask handle "pre-sorted" case - if pre_sorted: - return dd.shuffle.set_sorted_index( - self, other, divisions=divisions, **kwargs - ) - - by = other - if not isinstance(other, list): - by = [by] - if len(by) > 1: - raise ValueError("Dask does not support MultiIndex (yet).") - if divisions == "quantile": - divisions = None - - # Use dask_cudf's sort_values - df = self.sort_values( - by, - max_branch=kwargs.get("max_branch", None), - divisions=divisions, - set_divisions=True, - ignore_index=True, - shuffle_method=shuffle_method, - ) - - # Ignore divisions if its a dataframe - if isinstance(divisions, cudf.DataFrame): - divisions = None - - # Set index and repartition - df2 = df.map_partitions( - sorting.set_index_post, - index_name=other, - drop=kwargs.get("drop", True), - column_dtype=df.columns.dtype, - ) - npartitions = kwargs.get("npartitions", self.npartitions) - partition_size = kwargs.get("partition_size", None) - if partition_size: - return df2.repartition(partition_size=partition_size) - if not divisions and df2.npartitions != npartitions: - return df2.repartition(npartitions=npartitions) - if divisions and df2.npartitions != len(divisions) - 1: - return df2.repartition(divisions=divisions) - return df2 - - return super().set_index( - other, - sorted=pre_sorted, - shuffle_method=_get_shuffle_method(shuffle_method), - divisions=divisions, - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def sort_values( - self, - by, - ignore_index=False, - max_branch=None, - divisions=None, - set_divisions=False, - ascending=True, - na_position="last", - sort_function=None, - sort_function_kwargs=None, - shuffle_method=None, - **kwargs, - ): - if kwargs: - raise ValueError( - f"Unsupported input arguments passed : {list(kwargs.keys())}" - ) - - df = sorting.sort_values( - self, - by, - max_branch=max_branch, - divisions=divisions, - set_divisions=set_divisions, - ignore_index=ignore_index, - ascending=ascending, - na_position=na_position, - shuffle_method=shuffle_method, - sort_function=sort_function, - sort_function_kwargs=sort_function_kwargs, - ) - - if ignore_index: - return df.reset_index(drop=True) - return df - - @_dask_cudf_performance_tracking - def to_parquet(self, path, *args, **kwargs): - """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" - from dask_cudf._legacy.io import to_parquet - - return to_parquet(self, path, *args, **kwargs) - - @_dask_cudf_performance_tracking - def to_orc(self, path, **kwargs): - """Calls dask_cudf._legacy.io.to_orc""" - from dask_cudf._legacy.io import to_orc - - return to_orc(self, path, **kwargs) - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - numeric_only=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, 
split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def shuffle(self, *args, shuffle_method=None, **kwargs): - """Wraps dask.dataframe DataFrame.shuffle method""" - return super().shuffle( - *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs - ) - - @_dask_cudf_performance_tracking - def groupby(self, by=None, **kwargs): - from .groupby import CudfDataFrameGroupBy - - return CudfDataFrameGroupBy(self, by=by, **kwargs) - - -@_dask_cudf_performance_tracking -def sum_of_squares(x): - x = x.astype("f8")._column - outcol = libcudf.reduce.reduce("sum_of_squares", x) - return cudf.Series._from_column(outcol) - - -@_dask_cudf_performance_tracking -def var_aggregate(x2, x, n, ddof): - try: - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - result = (x2 / n) - (x / n) ** 2 - if ddof != 0: - result = result * n / (n - ddof) - return result - except ZeroDivisionError: - return np.float64(np.nan) - - -@_dask_cudf_performance_tracking -def nlargest_agg(x, **kwargs): - return cudf.concat(x).nlargest(**kwargs) - - -@_dask_cudf_performance_tracking -def nsmallest_agg(x, **kwargs): - return cudf.concat(x).nsmallest(**kwargs) - - -class Series(_Frame, dd.core.Series): - _partition_type = cudf.Series - - @_dask_cudf_performance_tracking - def count(self, split_every=False): - return reduction( - [self], - chunk=M.count, - aggregate=np.sum, - split_every=split_every, - meta="i8", - ) - - @_dask_cudf_performance_tracking - def mean(self, split_every=False): - sum = self.sum(split_every=split_every) - n = self.count(split_every=split_every) - return sum / n - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var(axis=axis, skipna=skipna) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_dask_cudf_performance_tracking - def groupby(self, *args, **kwargs): - from .groupby import CudfSeriesGroupBy - - return CudfSeriesGroupBy(self, *args, **kwargs) - - @property # type: ignore - @_dask_cudf_performance_tracking - def list(self): - return ListMethods(self) - - @property # type: ignore - @_dask_cudf_performance_tracking - def struct(self): - return StructMethods(self) - - -class Index(Series, dd.core.Index): - _partition_type = cudf.Index # type: ignore - - -@_dask_cudf_performance_tracking -def _naive_var(ddf, meta, skipna, ddof, split_every, out): - num = ddf._get_numeric_data() - x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) - n = num.count(split_every=split_every) - name = ddf._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof - ) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _parallel_var(ddf, meta, skipna, split_every, out): - def _local_var(x, skipna): - if skipna: - n = x.count() - avg = x.mean(skipna=skipna) - else: - # Not 
skipping nulls, so might as well - # avoid the full `count` operation - n = len(x) - avg = x.sum(skipna=skipna) / n - m2 = ((x - avg) ** 2).sum(skipna=skipna) - return n, avg, m2 - - def _aggregate_var(parts): - n, avg, m2 = parts[0] - for i in range(1, len(parts)): - n_a, avg_a, m2_a = n, avg, m2 - n_b, avg_b, m2_b = parts[i] - n = n_a + n_b - avg = (n_a * avg_a + n_b * avg_b) / n - delta = avg_b - avg_a - m2 = m2_a + m2_b + delta**2 * n_a * n_b / n - return n, avg, m2 - - def _finalize_var(vals): - n, _, m2 = vals - return m2 / (n - 1) - - # Build graph - nparts = ddf.npartitions - if not split_every: - split_every = nparts - name = "var-" + tokenize(skipna, split_every, out) - local_name = "local-" + name - num = ddf._get_numeric_data() - dsk = { - (local_name, n, 0): (_local_var, (num._name, n), skipna) - for n in range(nparts) - } - - # Use reduction tree - widths = [nparts] - while nparts > 1: - nparts = math.ceil(nparts / split_every) - widths.append(nparts) - height = len(widths) - for depth in range(1, height): - for group in range(widths[depth]): - p_max = widths[depth - 1] - lstart = split_every * group - lstop = min(lstart + split_every, p_max) - node_list = [ - (local_name, p, depth - 1) for p in range(lstart, lstop) - ] - dsk[(local_name, group, depth)] = (_aggregate_var, node_list) - if height == 1: - group = depth = 0 - dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) - - graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) - result = dd.core.new_dd_object(graph, name, meta, (None, None)) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _extract_meta(x): - """ - Extract internal cache data (``_meta``) from dask_cudf objects - """ - if isinstance(x, (Scalar, _Frame)): - return x._meta - elif isinstance(x, list): - return [_extract_meta(_x) for _x in x] - elif isinstance(x, tuple): - return tuple(_extract_meta(_x) for _x in x) - elif isinstance(x, dict): - return {k: _extract_meta(v) for k, v in x.items()} - return x - - -@_dask_cudf_performance_tracking -def _emulate(func, *args, **kwargs): - """ - Apply a function using args / kwargs. If arguments contain dd.DataFrame / - dd.Series, using internal cache (``_meta``) for calculation - """ - with raise_on_meta_error(funcname(func)): - return func(*_extract_meta(args), **_extract_meta(kwargs)) - - -@_dask_cudf_performance_tracking -def align_partitions(args): - """Align partitions between dask_cudf objects. - - Note that if all divisions are unknown, but have equal npartitions, then - they will be passed through unchanged. - """ - dfs = [df for df in args if isinstance(df, _Frame)] - if not dfs: - return args - - divisions = dfs[0].divisions - if not all(df.divisions == divisions for df in dfs): - raise NotImplementedError("Aligning mismatched partitions") - return args - - -@_dask_cudf_performance_tracking -def reduction( - args, - chunk=None, - aggregate=None, - combine=None, - meta=None, - token=None, - chunk_kwargs=None, - aggregate_kwargs=None, - combine_kwargs=None, - split_every=None, - **kwargs, -): - """Generic tree reduction operation. - - Parameters - ---------- - args : - Positional arguments for the `chunk` function. All `dask.dataframe` - objects should be partitioned and indexed equivalently. 
- chunk : function [block-per-arg] -> block - Function to operate on each block of data - aggregate : function list-of-blocks -> block - Function to operate on the list of results of chunk - combine : function list-of-blocks -> block, optional - Function to operate on intermediate lists of results of chunk - in a tree-reduction. If not provided, defaults to aggregate. - $META - token : str, optional - The name to use for the output keys. - chunk_kwargs : dict, optional - Keywords for the chunk function only. - aggregate_kwargs : dict, optional - Keywords for the aggregate function only. - combine_kwargs : dict, optional - Keywords for the combine function only. - split_every : int, optional - Group partitions into groups of this size while performing a - tree-reduction. If set to False, no tree-reduction will be used, - and all intermediates will be concatenated and passed to ``aggregate``. - Default is 8. - kwargs : - All remaining keywords will be passed to ``chunk``, ``aggregate``, and - ``combine``. - """ - if chunk_kwargs is None: - chunk_kwargs = dict() - if aggregate_kwargs is None: - aggregate_kwargs = dict() - chunk_kwargs.update(kwargs) - aggregate_kwargs.update(kwargs) - - if combine is None: - if combine_kwargs: - raise ValueError("`combine_kwargs` provided with no `combine`") - combine = aggregate - combine_kwargs = aggregate_kwargs - else: - if combine_kwargs is None: - combine_kwargs = dict() - combine_kwargs.update(kwargs) - - if not isinstance(args, (tuple, list)): - args = [args] - - npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)} - if len(npartitions) > 1: - raise ValueError("All arguments must have same number of partitions") - npartitions = npartitions.pop() - - if split_every is None: - split_every = 8 - elif split_every is False: - split_every = npartitions - elif split_every < 2 or not isinstance(split_every, int): - raise ValueError("split_every must be an integer >= 2") - - token_key = tokenize( - token or (chunk, aggregate), - meta, - args, - chunk_kwargs, - aggregate_kwargs, - combine_kwargs, - split_every, - ) - - # Chunk - a = f"{token or funcname(chunk)}-chunk-{token_key}" - if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: - dsk = { - (a, 0, i): (chunk, key) - for i, key in enumerate(args[0].__dask_keys__()) - } - else: - dsk = { - (a, 0, i): ( - apply, - chunk, - [(x._name, i) if isinstance(x, _Frame) else x for x in args], - chunk_kwargs, - ) - for i in range(args[0].npartitions) - } - - # Combine - b = f"{token or funcname(combine)}-combine-{token_key}" - k = npartitions - depth = 0 - while k > split_every: - for part_i, inds in enumerate(partition_all(split_every, range(k))): - conc = (list, [(a, depth, i) for i in inds]) - dsk[(b, depth + 1, part_i)] = ( - (apply, combine, [conc], combine_kwargs) - if combine_kwargs - else (combine, conc) - ) - k = part_i + 1 - a = b - depth += 1 - - # Aggregate - b = f"{token or funcname(aggregate)}-agg-{token_key}" - conc = (list, [(a, depth, i) for i in range(k)]) - if aggregate_kwargs: - dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) - else: - dsk[(b, 0)] = (aggregate, conc) - - if meta is None: - meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) - meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) - meta = dask_make_meta(meta) - - graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) - return dd.core.new_dd_object(graph, b, meta, (None, None)) - - -for name in ( - "add", - "sub", - "mul", - "truediv", - "floordiv", - 
"mod", - "pow", - "radd", - "rsub", - "rmul", - "rtruediv", - "rfloordiv", - "rmod", - "rpow", -): - meth = getattr(cudf.DataFrame, name) - DataFrame._bind_operator_method(name, meth, original=cudf.Series) - - meth = getattr(cudf.Series, name) - Series._bind_operator_method(name, meth, original=cudf.Series) - -for name in ("lt", "gt", "le", "ge", "ne", "eq"): - meth = getattr(cudf.Series, name) - Series._bind_comparison_method(name, meth, original=cudf.Series) diff --git a/python/dask_cudf/dask_cudf/_legacy/groupby.py b/python/dask_cudf/dask_cudf/_legacy/groupby.py deleted file mode 100644 index 7e01e91476d..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/groupby.py +++ /dev/null @@ -1,909 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -from functools import wraps - -import numpy as np -import pandas as pd - -from dask.dataframe.core import ( - DataFrame as DaskDataFrame, - aca, - split_out_on_cols, -) -from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy -from dask.utils import funcname - -import cudf -from cudf.core.groupby.groupby import _deprecate_collect -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._legacy.sorting import _deprecate_shuffle_kwarg - -# aggregations that are dask-cudf optimized -OPTIMIZED_AGGS = ( - "count", - "mean", - "std", - "var", - "sum", - "min", - "max", - list, - "first", - "last", -) - - -def _check_groupby_optimized(func): - """ - Decorator for dask-cudf's groupby methods that returns the dask-cudf - optimized method if the groupby object is supported, otherwise - reverting to the upstream Dask method - """ - - @wraps(func) - def wrapper(*args, **kwargs): - gb = args[0] - if _groupby_optimized(gb): - return func(*args, **kwargs) - # note that we use upstream Dask's default kwargs for this call if - # none are specified; this shouldn't be an issue as those defaults are - # consistent with dask-cudf - return getattr(super(type(gb), gb), func.__name__)(*args[1:], **kwargs) - - return wrapper - - -class CudfDataFrameGroupBy(DataFrameGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - def __getitem__(self, key): - if isinstance(key, list): - g = CudfDataFrameGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - else: - g = CudfSeriesGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - - g._meta = g._meta[key] - return g - - @_dask_cudf_performance_tracking - def _make_groupby_method_aggs(self, agg_name): - """Create aggs dictionary for aggregation methods""" - - if isinstance(self.by, list): - return {c: agg_name for c in self.obj.columns if c not in self.by} - return {c: agg_name for c in self.obj.columns if c != self.by} - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("count"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("mean"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, 
split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("std"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("var"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("sum"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("min"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("max"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs(list), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("first"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("last"), - split_every, - split_out, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - if isinstance(self._meta.grouping.keys, cudf.MultiIndex): - keys = self._meta.grouping.keys.names - else: - keys = self._meta.grouping.keys.name - - return groupby_agg( - self.obj, - keys, - arg, - split_every=split_every, - split_out=split_out, - sep=self.sep, - sort=self.sort, - as_index=self.as_index, - shuffle_method=shuffle_method, - **self.dropna, - ) - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -class CudfSeriesGroupBy(SeriesGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "count"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "mean"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "std"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, 
split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "var"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "sum"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "min"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "max"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - {self._slice: list}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "first"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "last"}, - split_every, - split_out, - )[self._slice] - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if not isinstance(arg, dict): - arg = {self._slice: arg} - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - return _make_groupby_agg_call( - self, arg, split_every, split_out, shuffle_method - )[self._slice] - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -def _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token=None, - sort=None, - shuffle_method=None, -): - # Shuffle-based groupby aggregation - # NOTE: This function is the dask_cudf version of - # dask.dataframe.groupby._shuffle_aggregate - - # Step 1 - Chunkwise groupby operation - chunk_name = f"{token or funcname(chunk)}-chunk" - chunked = ddf.map_partitions( - chunk, - meta=chunk(ddf._meta, **chunk_kwargs), - token=chunk_name, - **chunk_kwargs, - ) - - # Step 2 - Perform global sort or shuffle - shuffle_npartitions = max( - chunked.npartitions // split_every, - split_out, - ) - if sort and split_out > 1: - # Sort-based code path - result = ( - chunked.repartition(npartitions=shuffle_npartitions) - .sort_values( - gb_cols, - ignore_index=True, - shuffle_method=shuffle_method, - ) - .map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - ) - else: - # Hash-based code path - result = chunked.shuffle( - gb_cols, - npartitions=shuffle_npartitions, - ignore_index=True, - shuffle_method=shuffle_method, - ).map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - - # Step 3 - Repartition and return - if split_out < result.npartitions: - return result.repartition(npartitions=split_out) - return result - - -@_dask_cudf_performance_tracking -def 
groupby_agg(
-    ddf,
-    gb_cols,
-    aggs_in,
-    split_every=None,
-    split_out=None,
-    dropna=True,
-    sep="___",
-    sort=False,
-    as_index=True,
-    shuffle_method=None,
-):
-    """Optimized groupby aggregation for Dask-CuDF.
-
-    Parameters
-    ----------
-    ddf : DataFrame
-        DataFrame object to perform grouping on.
-    gb_cols : str or list[str]
-        Column names to group by.
-    aggs_in : str, list, or dict
-        Aggregations to perform.
-    split_every : int (optional)
-        How to group intermediate aggregates.
-    split_out : int (optional)
-        Number of output partitions. Defaults to 1.
-    dropna : bool
-        Drop grouping key values corresponding to NA values.
-    as_index : bool
-        Currently ignored.
-    sort : bool
-        Sort the group keys; better performance is obtained when
-        not sorting.
-    shuffle_method : str (optional)
-        Control how shuffling of the DataFrame is performed.
-    sep : str
-        Internal usage.
-
-
-    Notes
-    -----
-    This "optimized" approach is more performant than the algorithm
-    implemented in :meth:`DataFrame.apply` because it allows the cuDF
-    backend to perform multiple aggregations at once.
-
-    This aggregation algorithm only supports the following options:
-
-    * "list"
-    * "count"
-    * "first"
-    * "last"
-    * "max"
-    * "mean"
-    * "min"
-    * "std"
-    * "sum"
-    * "var"
-
-
-    See Also
-    --------
-    DataFrame.groupby : generic groupby of a DataFrame
-    dask.dataframe.apply_concat_apply : for more description of the
-        split_every argument.
-
-    """
-    # Assert that aggregations are supported
-    aggs = _redirect_aggs(aggs_in)
-    if not _aggs_optimized(aggs, OPTIMIZED_AGGS):
-        raise ValueError(
-            f"Supported aggs include {OPTIMIZED_AGGS} for groupby_agg API. "
-            f"Aggregations must be specified with dict or list syntax."
-        )
-
-    # If split_every is False, we use an all-to-one reduction
-    if split_every is False:
-        split_every = max(ddf.npartitions, 2)
-
-    # Deal with default split_out and split_every params
-    split_every = split_every or 8
-    split_out = split_out or 1
-
-    # Standardize `gb_cols`, `columns`, and `aggs`
-    if isinstance(gb_cols, str):
-        gb_cols = [gb_cols]
-    columns = [c for c in ddf.columns if c not in gb_cols]
-    if not isinstance(aggs, dict):
-        aggs = {col: aggs for col in columns}
-
-    # Determine whether our output will have a MultiIndex; this will be
-    # the case if any value in the `aggs` dict is not a string (i.e.
-    # multiple/named aggregations per column)
-    str_cols_out = True
-    aggs_renames = {}
-    for col in aggs:
-        if isinstance(aggs[col], str) or callable(aggs[col]):
-            aggs[col] = [aggs[col]]
-        elif isinstance(aggs[col], dict):
-            str_cols_out = False
-            col_aggs = []
-            for k, v in aggs[col].items():
-                aggs_renames[col, v] = k
-                col_aggs.append(v)
-            aggs[col] = col_aggs
-        else:
-            str_cols_out = False
-        if col in gb_cols:
-            columns.append(col)
-
-    # Construct meta
-    _aggs = aggs.copy()
-    if str_cols_out:
-        # Metadata should use `str` for dict values if that is
-        # what the user originally specified (column names will
-        # be str, rather than tuples).
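-        # Collapsing each single-element list back to a scalar keeps
-        # the emulated metadata consistent with the user's spec.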
- for col in aggs: - _aggs[col] = _aggs[col][0] - _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs) - if aggs_renames: - col_array = [] - agg_array = [] - for col, agg in _meta.columns: - col_array.append(col) - agg_array.append(aggs_renames.get((col, agg), agg)) - _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - chunk = _groupby_partition_agg - chunk_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - combine = _tree_node_agg - combine_kwargs = { - "gb_cols": gb_cols, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - aggregate = _finalize_gb_agg - aggregate_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "final_columns": _meta.columns, - "as_index": as_index, - "dropna": dropna, - "sort": sort, - "sep": sep, - "str_cols_out": str_cols_out, - "aggs_renames": aggs_renames, - } - - # Use shuffle_method=True for split_out>1 - if sort and split_out > 1 and shuffle_method is None: - shuffle_method = "tasks" - - # Check if we are using the shuffle-based algorithm - if shuffle_method: - # Shuffle-based aggregation - return _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token="cudf-aggregate", - sort=sort, - shuffle_method=shuffle_method - if isinstance(shuffle_method, str) - else None, - ) - - # Deal with sort/shuffle defaults - if split_out > 1 and sort: - raise ValueError( - "dask-cudf's groupby algorithm does not yet support " - "`sort=True` when `split_out>1`, unless a shuffle-based " - "algorithm is used. Please use `split_out=1`, group " - "with `sort=False`, or set `shuffle_method=True`." - ) - - # Determine required columns to enable column projection - required_columns = list( - set(gb_cols).union(aggs.keys()).intersection(ddf.columns) - ) - - return aca( - [ddf[required_columns]], - chunk=chunk, - chunk_kwargs=chunk_kwargs, - combine=combine, - combine_kwargs=combine_kwargs, - aggregate=aggregate, - aggregate_kwargs=aggregate_kwargs, - token="cudf-aggregate", - split_every=split_every, - split_out=split_out, - split_out_setup=split_out_on_cols, - split_out_setup_kwargs={"cols": gb_cols}, - sort=sort, - ignore_index=True, - ) - - -@_dask_cudf_performance_tracking -def _make_groupby_agg_call( - gb, aggs, split_every, split_out, shuffle_method=None -): - """Helper method to consolidate the common `groupby_agg` call for all - aggregations in one place - """ - - return groupby_agg( - gb.obj, - gb.by, - aggs, - split_every=split_every, - split_out=split_out, - sep=gb.sep, - sort=gb.sort, - as_index=gb.as_index, - shuffle_method=shuffle_method, - **gb.dropna, - ) - - -@_dask_cudf_performance_tracking -def _redirect_aggs(arg): - """Redirect aggregations to their corresponding name in cuDF""" - redirects = { - sum: "sum", - max: "max", - min: "min", - "collect": list, - "list": list, - } - if isinstance(arg, dict): - new_arg = dict() - for col in arg: - if isinstance(arg[col], list): - new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] - elif isinstance(arg[col], dict): - new_arg[col] = { - k: redirects.get(v, v) for k, v in arg[col].items() - } - else: - new_arg[col] = redirects.get(arg[col], arg[col]) - return new_arg - if isinstance(arg, list): - return [redirects.get(agg, agg) for agg in arg] - return redirects.get(arg, arg) - - -@_dask_cudf_performance_tracking -def _aggs_optimized(arg, supported: set): - """Check that aggregations in `arg` are a subset of 
`supported`""" - if isinstance(arg, (list, dict)): - if isinstance(arg, dict): - _global_set: set[str] = set() - for col in arg: - if isinstance(arg[col], list): - _global_set = _global_set.union(set(arg[col])) - elif isinstance(arg[col], dict): - _global_set = _global_set.union(set(arg[col].values())) - else: - _global_set.add(arg[col]) - else: - _global_set = set(arg) - - return bool(_global_set.issubset(supported)) - elif isinstance(arg, (str, type)): - return arg in supported - return False - - -@_dask_cudf_performance_tracking -def _groupby_optimized(gb): - """Check that groupby input can use dask-cudf optimized codepath""" - return isinstance(gb.obj, DaskDataFrame) and ( - isinstance(gb.by, str) - or (isinstance(gb.by, list) and all(isinstance(x, str) for x in gb.by)) - ) - - -def _make_name(col_name, sep="_"): - """Combine elements of `col_name` into a single string, or no-op if - `col_name` is already a string - """ - if isinstance(col_name, str): - return col_name - return sep.join(name for name in col_name if name != "") - - -@_dask_cudf_performance_tracking -def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): - """Initial partition-level aggregation task. - - This is the first operation to be executed on each input - partition in `groupby_agg`. Depending on `aggs`, four possible - groupby aggregations ("count", "sum", "min", and "max") are - performed. The result is then partitioned (by hashing `gb_cols`) - into a number of distinct dictionary elements. The number of - elements in the output dictionary (`split_out`) corresponds to - the number of partitions in the final output of `groupby_agg`. - """ - - # Modify dict for initial (partition-wise) aggregations - _agg_dict = {} - for col, agg_list in aggs.items(): - _agg_dict[col] = set() - for agg in agg_list: - if agg in ("mean", "std", "var"): - _agg_dict[col].add("count") - _agg_dict[col].add("sum") - else: - _agg_dict[col].add(agg) - _agg_dict[col] = list(_agg_dict[col]) - if set(agg_list).intersection({"std", "var"}): - pow2_name = _make_name((col, "pow2"), sep=sep) - df[pow2_name] = df[col].astype("float64").pow(2) - _agg_dict[pow2_name] = ["sum"] - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - _agg_dict - ) - output_columns = [_make_name(name, sep=sep) for name in gb.columns] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _tree_node_agg(df, gb_cols, dropna, sort, sep): - """Node in groupby-aggregation reduction tree. - - The input DataFrame (`df`) corresponds to the - concatenated output of one or more `_groupby_partition_agg` - tasks. In this function, "sum", "min" and/or "max" groupby - aggregations will be used to combine the statistics for - duplicate keys. 
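-
-    Partial "count"/"sum" statistics (including the squared-value sums
-    used for "std" and "var") are combined by summation; "mean", "std"
-    and "var" themselves are only derived from these partial statistics
-    later, in `_finalize_gb_agg`.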
- """ - - agg_dict = {} - for col in df.columns: - if col in gb_cols: - continue - agg = col.split(sep)[-1] - if agg in ("count", "sum"): - agg_dict[col] = ["sum"] - elif agg == "list": - agg_dict[col] = [list] - elif agg in OPTIMIZED_AGGS: - agg_dict[col] = [agg] - else: - raise ValueError(f"Unexpected aggregation: {agg}") - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - agg_dict - ) - - # Don't include the last aggregation in the column names - output_columns = [ - _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) - for name in gb.columns - ] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): - """Calculate variance (given count, sum, and sum-squared columns).""" - - # Select count, sum, and sum-squared - n = df[count_name] - x = df[sum_name] - x2 = df[pow2_sum_name] - - # Use sum-squared approach to get variance - var = x2 - x**2 / n - div = n - ddof - div[div < 1] = 1 # Avoid division by 0 - var /= div - - # Set appropriate NaN elements - # (since we avoided 0-division) - var[(n - ddof) == 0] = np.nan - - return var - - -@_dask_cudf_performance_tracking -def _finalize_gb_agg( - gb_in, - gb_cols, - aggs, - columns, - final_columns, - as_index, - dropna, - sort, - sep, - str_cols_out, - aggs_renames, -): - """Final aggregation task. - - This is the final operation on each output partitions - of the `groupby_agg` algorithm. This function must - take care of higher-order aggregations, like "mean", - "std" and "var". We also need to deal with the column - index, the row index, and final sorting behavior. - """ - - gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) - - # Deal with higher-order aggregations - for col in columns: - agg_list = aggs.get(col, []) - agg_set = set(agg_list) - if agg_set.intersection({"mean", "std", "var"}): - count_name = _make_name((col, "count"), sep=sep) - sum_name = _make_name((col, "sum"), sep=sep) - if agg_set.intersection({"std", "var"}): - pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) - var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) - if "var" in agg_list: - name_var = _make_name((col, "var"), sep=sep) - gb[name_var] = var - if "std" in agg_list: - name_std = _make_name((col, "std"), sep=sep) - gb[name_std] = np.sqrt(var) - gb.drop(columns=[pow2_sum_name], inplace=True) - if "mean" in agg_list: - mean_name = _make_name((col, "mean"), sep=sep) - gb[mean_name] = gb[sum_name] / gb[count_name] - if "sum" not in agg_list: - gb.drop(columns=[sum_name], inplace=True) - if "count" not in agg_list: - gb.drop(columns=[count_name], inplace=True) - if list in agg_list: - collect_name = _make_name((col, "list"), sep=sep) - gb[collect_name] = gb[collect_name].list.concat() - - # Ensure sorted keys if `sort=True` - if sort: - gb = gb.sort_values(gb_cols) - - # Set index if necessary - if as_index: - gb.set_index(gb_cols, inplace=True) - - # Unflatten column names - col_array = [] - agg_array = [] - for col in gb.columns: - if col in gb_cols: - col_array.append(col) - agg_array.append("") - else: - name, agg = col.split(sep) - col_array.append(name) - agg_array.append(aggs_renames.get((name, agg), agg)) - if str_cols_out: - gb.columns = col_array - else: - gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - return gb[final_columns] diff --git a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py 
b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py index 0421bd755f4..c544c32523f 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py @@ -1,11 +1 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from .csv import read_csv # noqa: F401 -from .json import read_json # noqa: F401 -from .orc import read_orc, to_orc # noqa: F401 -from .text import read_text # noqa: F401 - -try: - from .parquet import read_parquet, to_parquet # noqa: F401 -except ImportError: - pass +# Copyright (c) 2018-2025, NVIDIA CORPORATION. diff --git a/python/dask_cudf/dask_cudf/_legacy/io/csv.py b/python/dask_cudf/dask_cudf/_legacy/io/csv.py deleted file mode 100644 index fa5400344f9..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/csv.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -import os -from glob import glob -from warnings import warn - -from fsspec.utils import infer_compression - -from dask import dataframe as dd -from dask.base import tokenize -from dask.dataframe.io.csv import make_reader -from dask.utils import apply, parse_bytes - -import cudf - - -def read_csv(path, blocksize="default", **kwargs): - """ - Read CSV files into a :class:`.DataFrame`. - - This API parallelizes the :func:`cudf:cudf.read_csv` function in - the following ways: - - It supports loading many files at once using globstrings: - - >>> import dask_cudf - >>> df = dask_cudf.read_csv("myfiles.*.csv") - - In some cases it can break up large files: - - >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - - It can read CSV files from external resources (e.g. S3, HTTP, FTP) - - >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") - >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - - Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and - supports many of the same keyword arguments with the same - performance guarantees. See the docstring for - :func:`cudf:cudf.read_csv` for more information on available - keyword arguments. - - Parameters - ---------- - path : str, path object, or file-like object - Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as - builtin :py:func:`open` file handler function or - :py:class:`~io.StringIO`). - blocksize : int or str, default "256 MiB" - The target task partition size. If ``None``, a single block - is used for each file. - **kwargs : dict - Passthrough key-word arguments that are sent to - :func:`cudf:cudf.read_csv`. - - Notes - ----- - If any of `skipfooter`/`skiprows`/`nrows` are passed, - `blocksize` will default to None. - - Examples - -------- - >>> import dask_cudf - >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) - >>> ddf.compute() - a b - 0 1 hi - 1 2 hello - 2 3 ai - - """ - - # Handle `chunksize` deprecation - if "chunksize" in kwargs: - chunksize = kwargs.pop("chunksize", "default") - warn( - "`chunksize` is deprecated and will be removed in the future. " - "Please use `blocksize` instead.", - FutureWarning, - ) - if blocksize == "default": - blocksize = chunksize - - # Set default `blocksize` - if blocksize == "default": - if ( - kwargs.get("skipfooter", 0) != 0 - or kwargs.get("skiprows", 0) != 0 - or kwargs.get("nrows", None) is not None - ): - # Cannot read in blocks if skipfooter, - # skiprows or nrows is passed. 
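-            # (these options have whole-file semantics that cannot be
-            # honored when reading disjoint byte ranges)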
-            blocksize = None
-        else:
-            blocksize = "256 MiB"
-
-    if "://" in str(path):
-        func = make_reader(cudf.read_csv, "read_csv", "CSV")
-        return func(path, blocksize=blocksize, **kwargs)
-    else:
-        return _internal_read_csv(path=path, blocksize=blocksize, **kwargs)
-
-
-def _internal_read_csv(path, blocksize="256 MiB", **kwargs):
-    if isinstance(blocksize, str):
-        blocksize = parse_bytes(blocksize)
-
-    if isinstance(path, list):
-        filenames = path
-    elif isinstance(path, str):
-        filenames = sorted(glob(path))
-    elif hasattr(path, "__fspath__"):
-        filenames = sorted(glob(path.__fspath__()))
-    else:
-        raise TypeError(f"Path type not understood: {type(path)}")
-
-    if not filenames:
-        msg = f"No files found matching: {path}"
-        raise FileNotFoundError(msg)
-
-    name = "read-csv-" + tokenize(
-        path, tokenize, **kwargs
-    )  # TODO: get last modified time
-
-    compression = kwargs.get("compression", "infer")
-
-    if compression == "infer":
-        # Infer compression from first path by default
-        compression = infer_compression(filenames[0])
-
-    if compression and blocksize:
-        # Reading compressed CSVs must consume each file whole
-        kwargs.pop("byte_range", None)
-        warn(
-            "Warning: %s compression does not support breaking apart files.\n"
-            "Please ensure that each individual file can fit in memory, and\n"
-            "use the keyword ``blocksize=None`` to remove this message.\n"
-            "Setting ``blocksize=(size of file)``." % compression
-        )
-        blocksize = None
-
-    if blocksize is None:
-        return read_csv_without_blocksize(path, **kwargs)
-
-    # Let dask.dataframe generate meta
-    dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
-    kwargs1 = kwargs.copy()
-    usecols = kwargs1.pop("usecols", None)
-    dtype = kwargs1.pop("dtype", None)
-    meta = dask_reader(filenames[0], **kwargs1)._meta
-    names = meta.columns
-    if usecols or dtype:
-        # Regenerate meta with original kwargs if
-        # `usecols` or `dtype` was specified
-        meta = dask_reader(filenames[0], **kwargs)._meta
-
-    dsk = {}
-    i = 0
-    dtypes = meta.dtypes.values
-
-    for fn in filenames:
-        size = os.path.getsize(fn)
-        for start in range(0, size, blocksize):
-            kwargs2 = kwargs.copy()
-            kwargs2["byte_range"] = (
-                start,
-                blocksize,
-            )  # specify which chunk of the file we care about
-            if start != 0:
-                kwargs2["names"] = names  # no header in the middle of the file
-                kwargs2["header"] = None
-            dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2)
-
-            i += 1
-
-    divisions = [None] * (len(dsk) + 1)
-    return dd.core.new_dd_object(dsk, name, meta, divisions)
-
-
-def _read_csv(fn, dtypes=None, **kwargs):
-    return cudf.read_csv(fn, **kwargs)
-
-
-def read_csv_without_blocksize(path, **kwargs):
-    """Read entire CSV with optional compression (gzip/zip)
-
-    Parameters
-    ----------
-    path : str
-        Path to the file(s); glob strings are supported.
-    """
-    if isinstance(path, list):
-        filenames = path
-    elif isinstance(path, str):
-        filenames = sorted(glob(path))
-    elif hasattr(path, "__fspath__"):
-        filenames = sorted(glob(path.__fspath__()))
-    else:
-        raise TypeError(f"Path type not understood: {type(path)}")
-
-    name = "read-csv-" + tokenize(path, **kwargs)
-
-    meta_kwargs = kwargs.copy()
-    if "skipfooter" in meta_kwargs:
-        meta_kwargs.pop("skipfooter")
-    if "nrows" in meta_kwargs:
-        meta_kwargs.pop("nrows")
-    # Read "head" of first file (first 5 rows).
-    # Convert to empty df for metadata.
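-    # (`iloc[:0]` keeps column names and dtypes while dropping all rows)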
- meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0] - - graph = { - (name, i): (apply, cudf.read_csv, [fn], kwargs) - for i, fn in enumerate(filenames) - } - - divisions = [None] * (len(filenames) + 1) - - return dd.core.new_dd_object(graph, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/json.py b/python/dask_cudf/dask_cudf/_legacy/io/json.py deleted file mode 100644 index 98c5ceedb76..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/json.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from functools import partial - -import numpy as np -from fsspec.core import get_compression, get_fs_token_paths - -import dask -from dask.utils import parse_bytes - -import cudf -from cudf.core.column import as_column -from cudf.utils.ioutils import _is_local_filesystem - -from dask_cudf.backends import _default_backend - - -def _read_json_partition( - paths, - fs=None, - include_path_column=False, - path_converter=None, - **kwargs, -): - # Transfer all data up front for remote storage - sources = ( - paths - if fs is None - else fs.cat_ranges( - paths, - [0] * len(paths), - fs.sizes(paths), - ) - ) - - if include_path_column: - # Add "path" column. - # Must iterate over sources sequentially - if not isinstance(include_path_column, str): - include_path_column = "path" - converted_paths = ( - paths - if path_converter is None - else [path_converter(path) for path in paths] - ) - dfs = [] - for i, source in enumerate(sources): - df = cudf.read_json(source, **kwargs) - df[include_path_column] = as_column( - converted_paths[i], length=len(df) - ) - dfs.append(df) - return cudf.concat(dfs) - else: - # Pass sources directly to cudf - return cudf.read_json(sources, **kwargs) - - -def read_json( - url_path, - engine="auto", - blocksize=None, - orient="records", - lines=None, - compression="infer", - aggregate_files=True, - **kwargs, -): - """Read JSON data into a :class:`.DataFrame`. - - This function wraps :func:`dask.dataframe.read_json`, and passes - ``engine=partial(cudf.read_json, engine="auto")`` by default. - - Parameters - ---------- - url_path : str, list of str - Location to read from. If a string, can include a glob character to - find a set of file names. - Supports protocol specifications such as ``"s3://"``. - engine : str or Callable, default "auto" - - If str, this value will be used as the ``engine`` argument - when :func:`cudf.read_json` is used to create each partition. - If a :obj:`~collections.abc.Callable`, this value will be used as the - underlying function used to create each partition from JSON - data. The default value is "auto", so that - ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to :func:`dask.dataframe.read_json` by default. - aggregate_files : bool or int - Whether to map multiple files to each output partition. If True, - the `blocksize` argument will be used to determine the number of - files in each partition. If any one file is larger than `blocksize`, - the `aggregate_files` argument will be ignored. If an integer value - is specified, the `blocksize` argument will be ignored, and that - number of files will be mapped to each partition. Default is True. - **kwargs : - Key-word arguments to pass through to :func:`dask.dataframe.read_json`. 
-
-    Returns
-    -------
-    :class:`.DataFrame`
-
-    Examples
-    --------
-    Load single file
-
-    >>> from dask_cudf import read_json
-    >>> read_json('myfile.json')  # doctest: +SKIP
-
-    Load large line-delimited JSON files using partitions of
-    approximately 256 MB in size
-
-    >>> read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
-
-    Load nested JSON data
-
-    >>> read_json('myfile.json')  # doctest: +SKIP
-
-    See Also
-    --------
-    dask.dataframe.read_json
-
-    """
-
-    if lines is None:
-        lines = orient == "records"
-    if orient != "records" and lines:
-        raise ValueError(
-            'Line-delimited JSON is only available with orient="records".'
-        )
-    if blocksize and (orient != "records" or not lines):
-        raise ValueError(
-            "JSON file chunking only allowed for JSON-lines "
-            "input (orient='records', lines=True)."
-        )
-
-    inputs = []
-    if (aggregate_files and blocksize) or int(aggregate_files) > 1:
-        # Attempt custom read if we are mapping multiple files
-        # to each output partition. Otherwise, upstream logic
-        # is sufficient.
-
-        storage_options = kwargs.get("storage_options", {})
-        fs, _, paths = get_fs_token_paths(
-            url_path, mode="rb", storage_options=storage_options
-        )
-        if isinstance(aggregate_files, int) and aggregate_files > 1:
-            # Map a static file count to each partition
-            inputs = [
-                paths[offset : offset + aggregate_files]
-                for offset in range(0, len(paths), aggregate_files)
-            ]
-        elif aggregate_files is True and blocksize:
-            # Map files dynamically (using blocksize)
-            file_sizes = fs.sizes(paths)  # NOTE: This can be slow
-            blocksize = parse_bytes(blocksize)
-            if all([file_size <= blocksize for file_size in file_sizes]):
-                counts = np.unique(
-                    np.floor(np.cumsum(file_sizes) / blocksize),
-                    return_counts=True,
-                )[1]
-                offsets = np.concatenate([[0], counts.cumsum()])
-                inputs = [
-                    paths[offsets[i] : offsets[i + 1]]
-                    for i in range(len(offsets) - 1)
-                ]
-
-    if inputs:
-        # Inputs were successfully populated.
-        # Use custom _read_json_partition function
-        # to generate each partition.
-
-        compression = get_compression(
-            url_path[0] if isinstance(url_path, list) else url_path,
-            compression,
-        )
-        _kwargs = dict(
-            orient=orient,
-            lines=lines,
-            compression=compression,
-            include_path_column=kwargs.get("include_path_column", False),
-            path_converter=kwargs.get("path_converter"),
-        )
-        if not _is_local_filesystem(fs):
-            _kwargs["fs"] = fs
-        # TODO: Generate meta more efficiently
-        meta = _read_json_partition(inputs[0][:1], **_kwargs)
-        return dask.dataframe.from_map(
-            _read_json_partition,
-            inputs,
-            meta=meta,
-            **_kwargs,
-        )
-
-    # Fall back to dask.dataframe.read_json
-    return _default_backend(
-        dask.dataframe.read_json,
-        url_path,
-        engine=(
-            partial(cudf.read_json, engine=engine)
-            if isinstance(engine, str)
-            else engine
-        ),
-        blocksize=blocksize,
-        orient=orient,
-        lines=lines,
-        compression=compression,
-        **kwargs,
-    )
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/orc.py b/python/dask_cudf/dask_cudf/_legacy/io/orc.py
deleted file mode 100644
index fcf684fd6c8..00000000000
--- a/python/dask_cudf/dask_cudf/_legacy/io/orc.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from io import BufferedWriter, IOBase
-
-from fsspec.core import get_fs_token_paths
-from fsspec.utils import stringify_path
-from pyarrow import orc as orc
-
-from dask import dataframe as dd
-from dask.dataframe.io.utils import _get_pyarrow_dtypes
-
-import cudf
-
-
-def _read_orc_stripe(source, fs, columns=None, kwargs=None):
-    """Pull out specific columns from a specific stripe"""
-    path, stripe = source
-    if kwargs is None:
-        kwargs = {}
-    with fs.open(path, "rb") as f:
-        df_stripe = cudf.read_orc(
-            f, stripes=[stripe], columns=columns, **kwargs
-        )
-    return df_stripe
-
-
-def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
-    """Read ORC files into a :class:`.DataFrame`.
-
-    Note that this function is mostly borrowed from upstream Dask.
-
-    Parameters
-    ----------
-    path : str or list[str]
-        Location of file(s), which can be a full URL with protocol
-        specifier, and may include a glob character if a single string.
-    columns : None or list[str]
-        Columns to load. If None, loads all.
-    filters : None or list of tuple or list of lists of tuples
-        If not None, specifies a filter predicate used to filter out
-        stripes using statistics stored for each stripe in the ORC
-        metadata. Stripes that do not match the given filter predicate
-        are not read. The predicate is expressed in
-        `disjunctive normal form (DNF)
-        <https://en.wikipedia.org/wiki/Disjunctive_normal_form>`__
-        like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary
-        boolean logical combinations of single column predicates. The
-        innermost tuples each describe a single column predicate. The
-        list of inner predicates is interpreted as a conjunction
-        (AND), forming a more selective, multiple-column predicate.
-        Finally, the outermost list combines these filters as a
-        disjunction (OR). Predicates may also be passed as a list of
-        tuples. This form is interpreted as a single conjunction. To
-        express OR in predicates, one must use the (preferred)
-        notation of list of lists of tuples.
-    storage_options : None or dict
-        Further parameters to pass to the bytes backend.
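-    **kwargs :
-        Passed through to :func:`cudf.read_orc` when reading each stripe.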
- - See Also - -------- - dask.dataframe.read_orc - - Returns - ------- - dask_cudf.DataFrame - - """ - - storage_options = storage_options or {} - fs, _, paths = get_fs_token_paths( - path, mode="rb", storage_options=storage_options - ) - schema = None - nstripes_per_file = [] - for path in paths: - with fs.open(path, "rb") as f: - o = orc.ORCFile(f) - if schema is None: - schema = o.schema - elif schema != o.schema: - raise ValueError( - "Incompatible schemas while parsing ORC files" - ) - nstripes_per_file.append(o.nstripes) - schema = _get_pyarrow_dtypes(schema, categories=None) - if columns is not None: - ex = set(columns) - set(schema) - if ex: - raise ValueError( - f"Requested columns ({ex}) not in schema ({set(schema)})" - ) - else: - columns = list(schema) - - with fs.open(paths[0], "rb") as f: - meta = cudf.read_orc( - f, - stripes=[0] if nstripes_per_file[0] else None, - columns=columns, - **kwargs, - ) - - sources = [] - for path, n in zip(paths, nstripes_per_file): - for stripe in ( - range(n) - if filters is None - else cudf.io.orc._filter_stripes(filters, path) - ): - sources.append((path, stripe)) - - return dd.from_map( - _read_orc_stripe, - sources, - args=[fs], - columns=columns, - kwargs=kwargs, - meta=meta, - ) - - -def write_orc_partition(df, path, fs, filename, compression="snappy"): - full_path = fs.sep.join([path, filename]) - with fs.open(full_path, mode="wb") as out_file: - if not isinstance(out_file, IOBase): - out_file = BufferedWriter(out_file) - cudf.io.to_orc(df, out_file, compression=compression) - return full_path - - -def to_orc( - df, - path, - write_index=True, - storage_options=None, - compression="snappy", - compute=True, - **kwargs, -): - """ - Write a :class:`.DataFrame` to ORC file(s) (one file per partition). - - Parameters - ---------- - df : DataFrame - path : str or pathlib.Path - Destination directory for data. Prepend with protocol like ``s3://`` - or ``hdfs://`` for remote data. - write_index : boolean, optional - Whether or not to write the index. Defaults to True. - storage_options : None or dict - Further parameters to pass to the bytes backend. - compression : string or dict, optional - compute : bool, optional - If True (default) then the result is computed immediately. If - False then a :class:`~dask.delayed.Delayed` object is returned - for future computation. 
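-    **kwargs :
-        Additional keyword arguments; accepted for API compatibility but
-        not currently used by this legacy implementation.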
- - """ - - from dask import compute as dask_compute, delayed - - # TODO: Use upstream dask implementation once available - # (see: Dask Issue#5596) - - if hasattr(path, "name"): - path = stringify_path(path) - fs, _, _ = get_fs_token_paths( - path, mode="wb", storage_options=storage_options - ) - # Trim any protocol information from the path before forwarding - path = fs._strip_protocol(path) - - if write_index: - df = df.reset_index() - else: - # Not writing index - might as well drop it - df = df.reset_index(drop=True) - - fs.mkdirs(path, exist_ok=True) - - # Use i_offset and df.npartitions to define file-name list - filenames = ["part.%i.orc" % i for i in range(df.npartitions)] - - # write parts - dwrite = delayed(write_orc_partition) - parts = [ - dwrite(d, path, fs, filename, compression=compression) - for d, filename in zip(df.to_delayed(), filenames) - ] - - if compute: - return dask_compute(*parts) - - return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0638e4a1c3..c0792663c7e 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import itertools import warnings from functools import partial @@ -8,7 +8,7 @@ import pandas as pd from pyarrow import dataset as pa_ds, parquet as pq -from dask import dataframe as dd +import dask.dataframe as dd from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine try: @@ -448,65 +448,7 @@ def set_object_dtypes_from_pa_schema(df, schema): df._data[col_name] = col.astype(typ) -def read_parquet(path, columns=None, **kwargs): - """ - Read parquet files into a :class:`.DataFrame`. - - Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` - to coordinate the execution of :func:`cudf.read_parquet`, and to - ultimately create a :class:`.DataFrame` collection. - - See the :func:`dask.dataframe.read_parquet` documentation for - all available options. - - Examples - -------- - >>> from dask_cudf import read_parquet - >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP - - When dealing with one or more large parquet files having an - in-memory footprint >15% device memory, the ``split_row_groups`` - argument should be used to map Parquet **row-groups** to DataFrame - partitions (instead of **files** to partitions). For example, the - following code will map each row-group to a distinct partition: - - >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP - - To map **multiple** row-groups to each partition, an integer can be - passed to ``split_row_groups`` to specify the **maximum** number of - row-groups allowed in each output partition: - - >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP - - See Also - -------- - cudf.read_parquet - dask.dataframe.read_parquet - """ - if isinstance(columns, str): - columns = [columns] - - # Set "check_file_size" option to determine whether we - # should check the parquet-file size. This check is meant - # to "protect" users from `split_row_groups` default changes - check_file_size = kwargs.pop("check_file_size", 500_000_000) - if ( - check_file_size - and ("split_row_groups" not in kwargs) - and ("chunksize" not in kwargs) - ): - # User is not specifying `split_row_groups` or `chunksize`, - # so we should warn them if/when a file is ~>0.5GB on disk. 
- # They can set `split_row_groups` explicitly to silence/skip - # this check - if "read" not in kwargs: - kwargs["read"] = {} - kwargs["read"]["check_file_size"] = check_file_size - - return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) - - -to_parquet = partial(dd.to_parquet, engine=CudfEngine) +to_parquet = dd.to_parquet if create_metadata_file_dd is None: create_metadata_file = create_metadata_file_dd diff --git a/python/dask_cudf/dask_cudf/_legacy/io/text.py b/python/dask_cudf/dask_cudf/_legacy/io/text.py deleted file mode 100644 index 3757c85c80c..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/text.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -import os -from glob import glob - -import dask.dataframe as dd -from dask.utils import parse_bytes - -import cudf - - -def _read_text(source, **kwargs): - # Wrapper for cudf.read_text operation - fn, byte_range = source - return cudf.read_text(fn, byte_range=byte_range, **kwargs) - - -def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs): - if isinstance(chunksize, str): - chunksize = parse_bytes(chunksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - if chunksize and byte_range: - raise ValueError("Cannot specify both chunksize and byte_range.") - - if chunksize: - sources = [] - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, chunksize): - byte_range = ( - start, - chunksize, - ) # specify which chunk of the file we care about - sources.append((fn, byte_range)) - else: - sources = [(fn, byte_range) for fn in filenames] - - return dd.from_map( - _read_text, - sources, - meta=cudf.Series([], dtype="O"), - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/sorting.py b/python/dask_cudf/dask_cudf/_legacy/sorting.py deleted file mode 100644 index a2ba4d1878e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/sorting.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import warnings -from collections.abc import Iterator -from functools import wraps - -import cupy -import numpy as np -import tlz as toolz - -from dask import config -from dask.base import tokenize -from dask.dataframe import methods -from dask.dataframe.core import DataFrame, Index, Series -from dask.dataframe.shuffle import rearrange_by_column -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M - -import cudf -from cudf.api.types import _is_categorical_dtype -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -_SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported - - -def _deprecate_shuffle_kwarg(func): - @wraps(func) - def wrapper(*args, **kwargs): - old_arg_value = kwargs.pop("shuffle", None) - - if old_arg_value is not None: - new_arg_value = old_arg_value - msg = ( - "the 'shuffle' keyword is deprecated, " - "use 'shuffle_method' instead." - ) - - warnings.warn(msg, FutureWarning) - if kwargs.get("shuffle_method") is not None: - msg = ( - "Can only specify 'shuffle' " - "or 'shuffle_method', not both." 
- ) - raise TypeError(msg) - kwargs["shuffle_method"] = new_arg_value - return func(*args, **kwargs) - - return wrapper - - -@_dask_cudf_performance_tracking -def set_index_post(df, index_name, drop, column_dtype): - df2 = df.set_index(index_name, drop=drop) - df2.columns = df2.columns.astype(column_dtype) - return df2 - - -@_dask_cudf_performance_tracking -def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): - if ascending: - partitions = divisions.searchsorted(s, side="right") - 1 - else: - partitions = ( - len(divisions) - divisions.searchsorted(s, side="right") - 1 - ) - partitions[(partitions < 0) | (partitions >= len(divisions) - 1)] = ( - 0 if ascending else (len(divisions) - 2) - ) - partitions[s._columns[0].isnull().values] = ( - len(divisions) - 2 if na_position == "last" else 0 - ) - return partitions - - -@_dask_cudf_performance_tracking -def _quantile(a, q): - n = len(a) - if not len(a): - return None, n - return ( - a.quantile(q=q.tolist(), interpolation="nearest", method="table"), - n, - ) - - -@_dask_cudf_performance_tracking -def merge_quantiles(finalq, qs, vals): - """Combine several quantile calculations of different data. - [NOTE: Same logic as dask.array merge_percentiles] - """ - if isinstance(finalq, Iterator): - finalq = list(finalq) - finalq = np.array(finalq) - qs = list(map(list, qs)) - vals = list(vals) - vals, Ns = zip(*vals) - Ns = list(Ns) - - L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N])) - if not L: - raise ValueError("No non-trivial arrays found") - qs, vals, Ns = L - - if len(vals) != len(qs) or len(Ns) != len(qs): - raise ValueError("qs, vals, and Ns parameters must be the same length") - - # transform qs and Ns into number of observations between quantiles - counts = [] - for q, N in zip(qs, Ns): - count = np.empty(len(q)) - count[1:] = np.diff(q) - count[0] = q[0] - count *= N - counts.append(count) - - def _append_counts(val, count): - val["_counts"] = count - return val - - # Sort by calculated quantile values, then number of observations. - combined_vals_counts = cudf.core.reshape._merge_sorted( - [*map(_append_counts, vals, counts)] - ) - combined_counts = cupy.asnumpy(combined_vals_counts["_counts"].values) - combined_vals = combined_vals_counts.drop(columns=["_counts"]) - - # quantile-like, but scaled by total number of observations - combined_q = np.cumsum(combined_counts) - - # rescale finalq quantiles to match combined_q - desired_q = finalq * sum(Ns) - - # TODO: Support other interpolation methods - # For now - Always use "nearest" for interpolation - left = np.searchsorted(combined_q, desired_q, side="left") - right = np.searchsorted(combined_q, desired_q, side="right") - 1 - np.minimum(left, len(combined_vals) - 1, left) # don't exceed max index - lower = np.minimum(left, right) - upper = np.maximum(left, right) - lower_residual = np.abs(combined_q[lower] - desired_q) - upper_residual = np.abs(combined_q[upper] - desired_q) - mask = lower_residual > upper_residual - index = lower # alias; we no longer need lower - index[mask] = upper[mask] - rv = combined_vals.iloc[index] - return rv.reset_index(drop=True) - - -@_dask_cudf_performance_tracking -def _approximate_quantile(df, q): - """Approximate quantiles of DataFrame or Series. 
- [NOTE: Same logic as dask.dataframe Series quantile] - """ - # current implementation needs q to be sorted so - # sort if array-like, otherwise leave it alone - q_ndarray = np.array(q) - if q_ndarray.ndim > 0: - q_ndarray.sort(kind="mergesort") - q = q_ndarray - - # Lets assume we are dealing with a DataFrame throughout - if isinstance(df, (Series, Index)): - df = df.to_frame() - assert isinstance(df, DataFrame) - final_type = df._meta._constructor - - # Create metadata - meta = df._meta_nonempty.quantile(q=q, method="table") - - # Define final action (create df with quantiles as index) - def finalize_tsk(tsk): - return (final_type, tsk) - - return_type = df.__class__ - - # pandas/cudf uses quantile in [0, 1] - # numpy / cupy uses [0, 100] - qs = np.asarray(q) - token = tokenize(df, qs) - - if len(qs) == 0: - name = "quantiles-" + token - empty_index = cudf.Index([], dtype=float) - return Series( - { - (name, 0): final_type( - {col: [] for col in df.columns}, - name=df.name, - index=empty_index, - ) - }, - name, - df._meta, - [None, None], - ) - else: - new_divisions = [np.min(q), np.max(q)] - - name = "quantiles-1-" + token - val_dsk = { - (name, i): (_quantile, key, qs) - for i, key in enumerate(df.__dask_keys__()) - } - - name2 = "quantiles-2-" + token - merge_dsk = { - (name2, 0): finalize_tsk( - (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk)) - ) - } - dsk = toolz.merge(val_dsk, merge_dsk) - graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df]) - df = return_type(graph, name2, meta, new_divisions) - - def set_quantile_index(df): - df.index = q - return df - - df = df.map_partitions(set_quantile_index, meta=meta) - return df - - -@_dask_cudf_performance_tracking -def quantile_divisions(df, by, npartitions): - qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() - divisions = _approximate_quantile(df[by], qn).compute() - columns = divisions.columns - - # TODO: Make sure divisions are correct for all dtypes.. 
- if ( - len(columns) == 1 - and df[columns[0]].dtype != "object" - and not _is_categorical_dtype(df[columns[0]].dtype) - ): - dtype = df[columns[0]].dtype - divisions = divisions[columns[0]].astype("int64") - divisions.iloc[-1] += 1 - divisions = sorted( - divisions.drop_duplicates().astype(dtype).to_arrow().tolist(), - key=lambda x: (x is None, x), - ) - else: - for col in columns: - dtype = df[col].dtype - if dtype != "object": - divisions[col] = divisions[col].astype("int64") - divisions[col].iloc[-1] += 1 - divisions[col] = divisions[col].astype(dtype) - else: - if last := divisions[col].iloc[-1]: - val = chr(ord(last[0]) + 1) - else: - val = "this string intentionally left empty" # any but "" - divisions[col].iloc[-1] = val - divisions = divisions.drop_duplicates().sort_index() - return divisions - - -@_deprecate_shuffle_kwarg -@_dask_cudf_performance_tracking -def sort_values( - df, - by, - max_branch=None, - divisions=None, - set_divisions=False, - ignore_index=False, - ascending=True, - na_position="last", - shuffle_method=None, - sort_function=None, - sort_function_kwargs=None, -): - """Sort by the given list/tuple of column names.""" - - if not isinstance(ascending, bool): - raise ValueError("ascending must be either True or False") - if na_position not in ("first", "last"): - raise ValueError("na_position must be either 'first' or 'last'") - - npartitions = df.npartitions - if isinstance(by, tuple): - by = list(by) - elif not isinstance(by, list): - by = [by] - - # parse custom sort function / kwargs if provided - sort_kwargs = { - "by": by, - "ascending": ascending, - "na_position": na_position, - } - if sort_function is None: - sort_function = M.sort_values - if sort_function_kwargs is not None: - sort_kwargs.update(sort_function_kwargs) - - # handle single partition case - if npartitions == 1: - return df.map_partitions(sort_function, **sort_kwargs) - - # Step 1 - Calculate new divisions (if necessary) - if divisions is None: - divisions = quantile_divisions(df, by, npartitions) - - # Step 2 - Perform repartitioning shuffle - meta = df._meta._constructor_sliced([0]) - if not isinstance(divisions, (cudf.Series, cudf.DataFrame)): - dtype = df[by[0]].dtype - divisions = df._meta._constructor_sliced(divisions, dtype=dtype) - - partitions = df[by].map_partitions( - _set_partitions_pre, - divisions=divisions, - ascending=ascending, - na_position=na_position, - meta=meta, - ) - - df2 = df.assign(_partitions=partitions) - df3 = rearrange_by_column( - df2, - "_partitions", - max_branch=max_branch, - npartitions=len(divisions) - 1, - shuffle_method=_get_shuffle_method(shuffle_method), - ignore_index=ignore_index, - ).drop(columns=["_partitions"]) - df3.divisions = (None,) * (df3.npartitions + 1) - - # Step 3 - Return final sorted df - df4 = df3.map_partitions(sort_function, **sort_kwargs) - if not isinstance(divisions, cudf.DataFrame) and set_divisions: - # Can't have multi-column divisions elsewhere in dask (yet) - df4.divisions = tuple(methods.tolist(divisions)) - - return df4 - - -def get_default_shuffle_method(): - # Note that `dask.utils.get_default_shuffle_method` - # will return "p2p" by default when a distributed - # client is present. 
Dask-cudf supports "p2p", but - # will not use it by default (yet) - default = config.get("dataframe.shuffle.method", "tasks") - if default not in _SHUFFLE_SUPPORT: - default = "tasks" - return default - - -def _get_shuffle_method(shuffle_method): - # Utility to set the shuffle_method-kwarg default - # and to validate user-specified options - shuffle_method = shuffle_method or get_default_shuffle_method() - if shuffle_method not in _SHUFFLE_SUPPORT: - raise ValueError( - "Dask-cudf only supports the following shuffle " - f"methods: {_SHUFFLE_SUPPORT}. Got shuffle_method={shuffle_method}" - ) - - return shuffle_method diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index fceaaf185e8..f33733d9583 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import warnings from collections.abc import Iterator @@ -11,14 +11,12 @@ from packaging.version import Version from pandas.api.types import is_scalar -import dask.dataframe as dd from dask import config from dask.array.dispatch import percentile_lookup from dask.dataframe.backends import ( DataFrameBackendEntrypoint, PandasBackendEntrypoint, ) -from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( categorical_dtype_dispatch, concat_dispatch, @@ -28,6 +26,8 @@ hash_object_dispatch, is_categorical_dtype_dispatch, make_meta_dispatch, + meta_nonempty, + partd_encode_dispatch, pyarrow_schema_dispatch, to_pyarrow_table_dispatch, tolist_dispatch, @@ -46,13 +46,6 @@ from cudf.api.types import is_string_dtype from cudf.utils.performance_tracking import _dask_cudf_performance_tracking -from ._legacy.core import DataFrame, Index, Series - -get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) -get_parallel_type.register(cudf.Series, lambda _: Series) -get_parallel_type.register(cudf.BaseIndex, lambda _: Index) - - # Required for Arrow filesystem support in read_parquet PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0") @@ -318,7 +311,7 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( - (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) + (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype) # , Series) ) @_dask_cudf_performance_tracking def is_categorical_dtype_cudf(obj): @@ -464,28 +457,21 @@ def sizeof_cudf_series_index(obj): return obj.memory_usage() -# TODO: Remove try/except when cudf is pinned to dask>=2023.10.0 -try: - from dask.dataframe.dispatch import partd_encode_dispatch - - @partd_encode_dispatch.register(cudf.DataFrame) - def _simple_cudf_encode(_): - # Basic pickle-based encoding for a partd k-v store - import pickle +@partd_encode_dispatch.register(cudf.DataFrame) +def _simple_cudf_encode(_): + # Basic pickle-based encoding for a partd k-v store + import pickle - import partd + import partd - def join(dfs): - if not dfs: - return cudf.DataFrame() - else: - return cudf.concat(dfs) - - dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) - return partial(partd.Encode, dumps, pickle.loads, join) + def join(dfs): + if not dfs: + return cudf.DataFrame() + else: + return cudf.concat(dfs) -except ImportError: - pass + dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) + return partial(partd.Encode, dumps, pickle.loads, join) def _default_backend(func, *args, **kwargs): @@ -557,105 +543,22 @@ def to_cudf_dispatch_from_cudf(data, 
**kwargs): return data -# Define "cudf" backend engine to be registered with Dask -class CudfBackendEntrypoint(DataFrameBackendEntrypoint): - """Backend-entrypoint class for Dask-DataFrame +# Define the "cudf" backend for "legacy" Dask DataFrame +class LegacyCudfBackendEntrypoint(DataFrameBackendEntrypoint): + """Backend-entrypoint class for legacy Dask-DataFrame This class is registered under the name "cudf" for the - ``dask.dataframe.backends`` entrypoint in ``setup.cfg``. - Dask-DataFrame will use the methods defined in this class - in place of ``dask.dataframe.`` when the - "dataframe.backend" configuration is set to "cudf": - - Examples - -------- - >>> import dask - >>> import dask.dataframe as dd - >>> with dask.config.set({"dataframe.backend": "cudf"}): - ... ddf = dd.from_dict({"a": range(10)}) - >>> type(ddf) - + ``dask.dataframe.backends`` entrypoint in ``pyproject.toml``. + This "legacy" backend is only used for CSV support. """ - @classmethod - def to_backend_dispatch(cls): - return to_cudf_dispatch - - @classmethod - def to_backend(cls, data: dd.core._Frame, **kwargs): - if isinstance(data._meta, (cudf.DataFrame, cudf.Series, cudf.Index)): - # Already a cudf-backed collection - _unsupported_kwargs("cudf", "cudf", kwargs) - return data - return data.map_partitions(cls.to_backend_dispatch(), **kwargs) - - @staticmethod - def from_dict( - data, - npartitions, - orient="columns", - dtype=None, - columns=None, - constructor=cudf.DataFrame, - ): - return _default_backend( - dd.from_dict, - data, - npartitions=npartitions, - orient=orient, - dtype=dtype, - columns=columns, - constructor=constructor, - ) - - @staticmethod - def read_parquet(*args, engine=None, **kwargs): - from dask_cudf._legacy.io.parquet import CudfEngine - - _raise_unsupported_parquet_kwargs(**kwargs) - return _default_backend( - dd.read_parquet, - *args, - engine=CudfEngine, - **kwargs, - ) - - @staticmethod - def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json - - return read_json(*args, **kwargs) - @staticmethod - def read_orc(*args, **kwargs): - from dask_cudf._legacy.io import read_orc - - return read_orc(*args, **kwargs) - - @staticmethod - def read_csv(*args, **kwargs): - from dask_cudf._legacy.io import read_csv - - return read_csv(*args, **kwargs) - - @staticmethod - def read_hdf(*args, **kwargs): - # HDF5 reader not yet implemented in cudf - warnings.warn( - "read_hdf is not yet implemented in cudf/dask_cudf. " - "Moving to cudf from pandas. Expect poor performance!" - ) - return _default_backend(dd.read_hdf, *args, **kwargs).to_backend( - "cudf" - ) - - -# Define "cudf" backend entrypoint for dask-expr -class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint): +# Define the "cudf" backend for expr-based Dask DataFrame +class CudfBackendEntrypoint(DataFrameBackendEntrypoint): """Backend-entrypoint class for Dask-Expressions This class is registered under the name "cudf" for the - ``dask-expr.dataframe.backends`` entrypoint in ``setup.cfg``. + ``dask_expr.dataframe.backends`` entrypoint in ``pyproject.toml``. 
Dask-DataFrame will use the methods defined in this class in place of ``dask_expr.`` when the "dataframe.backend" configuration is set to "cudf": @@ -746,12 +649,12 @@ def read_csv( @staticmethod def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json as read_json_impl + from dask_cudf.io.json import read_json as read_json_impl return read_json_impl(*args, **kwargs) @staticmethod def read_orc(*args, **kwargs): - from dask_cudf._legacy.io.orc import read_orc as legacy_read_orc + from dask_cudf.io.orc import read_orc as legacy_read_orc return legacy_read_orc(*args, **kwargs) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 5fd217209ec..32461104ef9 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,56 +1,41 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import textwrap +import warnings +from importlib import import_module import dask.dataframe as dd -from dask.tokenize import tokenize import cudf from cudf.utils.performance_tracking import _dask_cudf_performance_tracking # This module provides backward compatibility for legacy import patterns. -if dd.DASK_EXPR_ENABLED: - from dask_cudf._expr.collection import ( - DataFrame, - Index, - Series, - ) -else: - from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 - +from dask_cudf._expr.collection import ( + DataFrame, # noqa: F401 + Index, # noqa: F401 + Series, # noqa: F401 +) concat = dd.concat @_dask_cudf_performance_tracking def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): - from dask_cudf import QUERY_PLANNING_ON - if isinstance(getattr(data, "index", None), cudf.MultiIndex): raise NotImplementedError( "dask_cudf does not support MultiIndex Dataframes." ) - # Dask-expr doesn't support the `name` argument - name = {} - if not QUERY_PLANNING_ON: - name = { - "name": name - or ("from_cudf-" + tokenize(data, npartitions or chunksize)) - } - return dd.from_pandas( data, npartitions=npartitions, chunksize=chunksize, sort=sort, - **name, ) -from_cudf.__doc__ = ( - textwrap.dedent( - """ +from_cudf.__doc__ = textwrap.dedent( + """ Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`. This function is a thin wrapper around @@ -58,9 +43,23 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): arguments (described below) excepting that it operates on cuDF rather than pandas objects.\n """ - ) - # TODO: `dd.from_pandas.__doc__` is empty when - # `DASK_DATAFRAME__QUERY_PLANNING=True` - # since dask-expr does not provide a docstring for from_pandas. - + textwrap.dedent(dd.from_pandas.__doc__ or "") -) +) + textwrap.dedent(dd.from_pandas.__doc__) + + +def _deprecated_api(old_api, new_api=None, rec=None): + def inner_func(*args, **kwargs): + if new_api: + # Use alternative + msg = f"{old_api} is now deprecated. " + msg += rec or f"Please use {new_api} instead." + warnings.warn(msg, FutureWarning) + new_attr = new_api.split(".") + module = import_module(".".join(new_attr[:-1])) + return getattr(module, new_attr[-1])(*args, **kwargs) + + # No alternative - raise an error + raise NotImplementedError( + f"{old_api} is no longer supported. 
" + (rec or "") + ) + + return inner_func diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 9bca33e414a..a5175c9bbe7 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api +from dask_cudf.core import _deprecated_api from . import csv, json, orc, parquet, text # noqa: F401 @@ -15,20 +15,13 @@ ) to_orc = _deprecated_api( "dask_cudf.io.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.orc.to_orc", rec="Please use the DataFrame.to_orc method instead.", ) read_text = _deprecated_api( "dask_cudf.io.read_text", new_api="dask_cudf.read_text" ) -if QUERY_PLANNING_ON: - read_parquet = parquet.read_parquet -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = parquet.read_parquet to_parquet = _deprecated_api( "dask_cudf.io.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 29f98b14511..e36ee04d827 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import os from glob import glob @@ -25,11 +25,11 @@ def read_csv(path, blocksize="default", **kwargs): >>> import dask_cudf >>> df = dask_cudf.read_csv("myfiles.*.csv") - In some cases it can break up large files: + It can break up large files if blocksize is specified: >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - It can read CSV files from external resources (e.g. S3, HTTP, FTP) + It can read CSV files from external resources (e.g. S3, HTTP, FTP): >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") @@ -44,15 +44,15 @@ def read_csv(path, blocksize="default", **kwargs): ---------- path : str, path object, or file-like object Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as + ``py._path.local.LocalPath``), URL (including HTTP, FTP, and S3 + locations), or any object with a ``read()`` method (such as builtin :py:func:`open` file handler function or :py:class:`~io.StringIO`). blocksize : int or str, default "256 MiB" The target task partition size. If ``None``, a single block is used for each file. **kwargs : dict - Passthrough key-word arguments that are sent to + Passthrough keyword arguments that are sent to :func:`cudf:cudf.read_csv`. Notes diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 8f85ea54c0a..3022ebb2a5b 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -1,8 +1,209 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
-from dask_cudf import _deprecated_api
+from functools import partial

-read_json = _deprecated_api(
-    "dask_cudf.io.json.read_json",
-    new_api="dask_cudf.read_json",
-)
+import numpy as np
+from fsspec.core import get_compression, get_fs_token_paths
+
+import dask
+from dask.utils import parse_bytes
+
+import cudf
+from cudf.core.column import as_column
+from cudf.utils.ioutils import _is_local_filesystem
+
+from dask_cudf.backends import _default_backend
+
+
+def _read_json_partition(
+    paths,
+    fs=None,
+    include_path_column=False,
+    path_converter=None,
+    **kwargs,
+):
+    # Transfer all data up front for remote storage
+    sources = (
+        paths
+        if fs is None
+        else fs.cat_ranges(
+            paths,
+            [0] * len(paths),
+            fs.sizes(paths),
+        )
+    )
+
+    if include_path_column:
+        # Add "path" column.
+        # Must iterate over sources sequentially
+        if not isinstance(include_path_column, str):
+            include_path_column = "path"
+        converted_paths = (
+            paths
+            if path_converter is None
+            else [path_converter(path) for path in paths]
+        )
+        dfs = []
+        for i, source in enumerate(sources):
+            df = cudf.read_json(source, **kwargs)
+            df[include_path_column] = as_column(
+                converted_paths[i], length=len(df)
+            )
+            dfs.append(df)
+        return cudf.concat(dfs)
+    else:
+        # Pass sources directly to cudf
+        return cudf.read_json(sources, **kwargs)
+
+
+def read_json(
+    url_path,
+    engine="auto",
+    blocksize=None,
+    orient="records",
+    lines=None,
+    compression="infer",
+    aggregate_files=True,
+    **kwargs,
+):
+    """Read JSON data into a :class:`.DataFrame`.
+
+    This function wraps :func:`dask.dataframe.read_json`, and passes
+    ``engine=partial(cudf.read_json, engine="auto")`` by default.
+
+    Parameters
+    ----------
+    url_path : str, list of str
+        Location to read from. If a string, can include a glob character to
+        find a set of file names.
+        Supports protocol specifications such as ``"s3://"``.
+    engine : str or Callable, default "auto"
+
+        If str, this value will be used as the ``engine`` argument
+        when :func:`cudf.read_json` is used to create each partition.
+        If a :obj:`~collections.abc.Callable`, this value will be used as the
+        underlying function used to create each partition from JSON
+        data. The default value is "auto", so that
+        ``engine=partial(cudf.read_json, engine="auto")`` will be
+        passed to :func:`dask.dataframe.read_json` by default.
+    aggregate_files : bool or int
+        Whether to map multiple files to each output partition. If True,
+        the `blocksize` argument will be used to determine the number of
+        files in each partition. If any one file is larger than `blocksize`,
+        the `aggregate_files` argument will be ignored. If an integer value
+        is specified, the `blocksize` argument will be ignored, and that
+        number of files will be mapped to each partition. Default is True.
+    **kwargs :
+        Keyword arguments to pass through to :func:`dask.dataframe.read_json`.
+
+    Returns
+    -------
+    :class:`.DataFrame`
+
+    Examples
+    --------
+    Load single file
+
+    >>> from dask_cudf import read_json
+    >>> read_json('myfile.json')  # doctest: +SKIP
+
+    Load large line-delimited JSON files using partitions of approx
+    256MB size
+
+    >>> read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
+
+    Load nested JSON data
+
+    >>> read_json('myfile.json')  # doctest: +SKIP
+
+    See Also
+    --------
+    dask.dataframe.read_json
+
+    """
+
+    if lines is None:
+        lines = orient == "records"
+    if orient != "records" and lines:
+        raise ValueError(
+            'Line-delimited JSON is only available with orient="records".'
+        )
+    if blocksize and (orient != "records" or not lines):
+        raise ValueError(
+            "JSON file chunking only allowed for JSON-lines "
+            "input (orient='records', lines=True)."
+        )
+
+    inputs = []
+    if (aggregate_files and blocksize) or int(aggregate_files) > 1:
+        # Attempt custom read if we are mapping multiple files
+        # to each output partition. Otherwise, upstream logic
+        # is sufficient.
+
+        storage_options = kwargs.get("storage_options", {})
+        fs, _, paths = get_fs_token_paths(
+            url_path, mode="rb", storage_options=storage_options
+        )
+        if isinstance(aggregate_files, int) and aggregate_files > 1:
+            # Map a static file count to each partition
+            inputs = [
+                paths[offset : offset + aggregate_files]
+                for offset in range(0, len(paths), aggregate_files)
+            ]
+        elif aggregate_files is True and blocksize:
+            # Map files dynamically (using blocksize)
+            file_sizes = fs.sizes(paths)  # NOTE: This can be slow
+            blocksize = parse_bytes(blocksize)
+            if all([file_size <= blocksize for file_size in file_sizes]):
+                counts = np.unique(
+                    np.floor(np.cumsum(file_sizes) / blocksize),
+                    return_counts=True,
+                )[1]
+                offsets = np.concatenate([[0], counts.cumsum()])
+                inputs = [
+                    paths[offsets[i] : offsets[i + 1]]
+                    for i in range(len(offsets) - 1)
+                ]
+
+    if inputs:
+        # Inputs were successfully populated.
+        # Use custom _read_json_partition function
+        # to generate each partition.
+
+        compression = get_compression(
+            url_path[0] if isinstance(url_path, list) else url_path,
+            compression,
+        )
+        _kwargs = dict(
+            orient=orient,
+            lines=lines,
+            compression=compression,
+            include_path_column=kwargs.get("include_path_column", False),
+            path_converter=kwargs.get("path_converter"),
+        )
+        if not _is_local_filesystem(fs):
+            _kwargs["fs"] = fs
+        # TODO: Generate meta more efficiently
+        meta = _read_json_partition(inputs[0][:1], **_kwargs)
+        return dask.dataframe.from_map(
+            _read_json_partition,
+            inputs,
+            meta=meta,
+            **_kwargs,
+        )
+
+    # Fall back to dask.dataframe.read_json
+    return _default_backend(
+        dask.dataframe.read_json,
+        url_path,
+        engine=(
+            partial(cudf.read_json, engine=engine)
+            if isinstance(engine, str)
+            else engine
+        ),
+        blocksize=blocksize,
+        orient=orient,
+        lines=lines,
+        compression=compression,
+        **kwargs,
+    )
diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py
index 5219cdacc31..5de28751912 100644
--- a/python/dask_cudf/dask_cudf/io/orc.py
+++ b/python/dask_cudf/dask_cudf/io/orc.py
@@ -1,13 +1,195 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-from dask_cudf import _deprecated_api
-
-read_orc = _deprecated_api(
-    "dask_cudf.io.orc.read_orc",
-    new_api="dask_cudf.read_orc",
-)
-to_orc = _deprecated_api(
-    "dask_cudf.io.orc.to_orc",
-    new_api="dask_cudf._legacy.io.orc.to_orc",
-    rec="Please use the DataFrame.to_orc method instead.",
-)
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
+
+from io import BufferedWriter, IOBase
+
+from fsspec.core import get_fs_token_paths
+from fsspec.utils import stringify_path
+from pyarrow import orc
+
+from dask import dataframe as dd
+from dask.dataframe.io.utils import _get_pyarrow_dtypes
+
+import cudf
+
+
+def _read_orc_stripe(source, fs, columns=None, kwargs=None):
+    """Pull out specific columns from a specific stripe"""
+    path, stripe = source
+    if kwargs is None:
+        kwargs = {}
+    with fs.open(path, "rb") as f:
+        df_stripe = cudf.read_orc(
+            f, stripes=[stripe], columns=columns, **kwargs
+        )
+    return df_stripe
+
+
+def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
+    """Read ORC files into a :class:`.DataFrame`.
+
+    Note that this function is mostly borrowed from upstream Dask.
+
+    Parameters
+    ----------
+    path : str or list[str]
+        Location of file(s), which can be a full URL with protocol specifier,
+        and may include glob character if a single string.
+    columns : None or list[str]
+        Columns to load. If None, loads all.
+    filters : None or list of tuple or list of lists of tuples
+        If not None, specifies a filter predicate used to filter out
+        stripes using statistics stored for each stripe as ORC
+        metadata. Stripes that do not match the given filter
+        predicate are not read. The predicate is expressed in
+        `disjunctive normal form (DNF)
+        <https://en.wikipedia.org/wiki/Disjunctive_normal_form>`__
+        like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary
+        boolean logical combinations of single column predicates. The
+        innermost tuples each describe a single column predicate. The
+        list of inner predicates is interpreted as a conjunction
+        (AND), forming a more selective and multiple column predicate.
+        Finally, the outermost list combines these filters as a
+        disjunction (OR). Predicates may also be passed as a list of
+        tuples. This form is interpreted as a single conjunction. To
+        express OR in predicates, one must use the (preferred)
+        notation of list of lists of tuples.
+    storage_options : None or dict
+        Further parameters to pass to the bytes backend.
+ + See Also + -------- + dask.dataframe.read_orc + + Returns + ------- + dask_cudf.DataFrame + + """ + + storage_options = storage_options or {} + fs, _, paths = get_fs_token_paths( + path, mode="rb", storage_options=storage_options + ) + schema = None + nstripes_per_file = [] + for path in paths: + with fs.open(path, "rb") as f: + o = orc.ORCFile(f) + if schema is None: + schema = o.schema + elif schema != o.schema: + raise ValueError( + "Incompatible schemas while parsing ORC files" + ) + nstripes_per_file.append(o.nstripes) + schema = _get_pyarrow_dtypes(schema, categories=None) + if columns is not None: + ex = set(columns) - set(schema) + if ex: + raise ValueError( + f"Requested columns ({ex}) not in schema ({set(schema)})" + ) + else: + columns = list(schema) + + with fs.open(paths[0], "rb") as f: + meta = cudf.read_orc( + f, + stripes=[0] if nstripes_per_file[0] else None, + columns=columns, + **kwargs, + ) + + sources = [] + for path, n in zip(paths, nstripes_per_file): + for stripe in ( + range(n) + if filters is None + else cudf.io.orc._filter_stripes(filters, path) + ): + sources.append((path, stripe)) + + return dd.from_map( + _read_orc_stripe, + sources, + args=[fs], + columns=columns, + kwargs=kwargs, + meta=meta, + ) + + +def write_orc_partition(df, path, fs, filename, compression="snappy"): + full_path = fs.sep.join([path, filename]) + with fs.open(full_path, mode="wb") as out_file: + if not isinstance(out_file, IOBase): + out_file = BufferedWriter(out_file) + cudf.io.to_orc(df, out_file, compression=compression) + return full_path + + +def to_orc( + df, + path, + write_index=True, + storage_options=None, + compression="snappy", + compute=True, + **kwargs, +): + """ + Write a :class:`.DataFrame` to ORC file(s) (one file per partition). + + Parameters + ---------- + df : DataFrame + path : str or pathlib.Path + Destination directory for data. Prepend with protocol like ``s3://`` + or ``hdfs://`` for remote data. + write_index : boolean, optional + Whether or not to write the index. Defaults to True. + storage_options : None or dict + Further parameters to pass to the bytes backend. + compression : string or dict, optional + compute : bool, optional + If True (default) then the result is computed immediately. If + False then a :class:`~dask.delayed.Delayed` object is returned + for future computation. 
+ + """ + + from dask import compute as dask_compute, delayed + + # TODO: Use upstream dask implementation once available + # (see: Dask Issue#5596) + + if hasattr(path, "name"): + path = stringify_path(path) + fs, _, _ = get_fs_token_paths( + path, mode="wb", storage_options=storage_options + ) + # Trim any protocol information from the path before forwarding + path = fs._strip_protocol(path) + + if write_index: + df = df.reset_index() + else: + # Not writing index - might as well drop it + df = df.reset_index(drop=True) + + fs.mkdirs(path, exist_ok=True) + + # Use i_offset and df.npartitions to define file-name list + filenames = ["part.%i.orc" % i for i in range(df.npartitions)] + + # write parts + dwrite = delayed(write_orc_partition) + parts = [ + dwrite(d, path, fs, filename, compression=compression) + for d, filename in zip(df.to_delayed(), filenames) + ] + + if compute: + return dask_compute(*parts) + + return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ba6209c4820..a953dce787d 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -37,10 +37,9 @@ def TaskList(*x): import cudf -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api - # Dask-expr imports CudfEngine from this module from dask_cudf._legacy.io.parquet import CudfEngine +from dask_cudf.core import _deprecated_api if TYPE_CHECKING: from collections.abc import MutableMapping @@ -832,15 +831,8 @@ def read_parquet_expr( ) -if QUERY_PLANNING_ON: - read_parquet = read_parquet_expr - read_parquet.__doc__ = read_parquet_expr.__doc__ -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.parquet.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = read_parquet_expr +read_parquet.__doc__ = read_parquet_expr.__doc__ to_parquet = _deprecated_api( "dask_cudf.io.parquet.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index f5509cf91c3..48eca13e16f 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import math import os @@ -11,10 +11,6 @@ from dask.utils import tmpfile import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") def test_read_json_backend_dispatch(tmp_path): @@ -137,7 +133,3 @@ def test_deprecated_api_paths(tmp_path): with pytest.warns(match="dask_cudf.io.read_json is now deprecated"): df2 = dask_cudf.io.read_json(path) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.json.read_json is now deprecated"): - df2 = dask_cudf.io.json.read_json(path) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index b6064d851ca..4aac463420b 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
import glob import os @@ -12,10 +12,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) sample_orc = os.path.join(cur_dir, "data/orc/sample.orc") @@ -159,7 +155,3 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.read_orc is now deprecated"): df2 = dask_cudf.io.read_orc(paths) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.orc.read_orc is now deprecated"): - df2 = dask_cudf.io.orc.read_orc(paths) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 6efe6c4f388..9f7031f4d2a 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import glob import math @@ -16,11 +16,6 @@ import dask_cudf from dask_cudf._legacy.io.parquet import create_metadata_file -from dask_cudf.tests.utils import ( - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) # Check if create_metadata_file is supported by # the current dask.dataframe version @@ -450,7 +445,6 @@ def test_create_metadata_file(tmpdir, partition_on): dd.assert_eq(ddf1, ddf2) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @need_create_meta def test_create_metadata_file_inconsistent_schema(tmpdir): # NOTE: This test demonstrates that the CudfEngine @@ -531,19 +525,6 @@ def test_cudf_list_struct_write(tmpdir): dd.assert_eq(df, new_ddf) -@skip_dask_expr("Not necessary in dask-expr") -def test_check_file_size(tmpdir): - # Test simple file-size check to help warn users - # of upstream change to `split_row_groups` default - fn = str(tmpdir.join("test.parquet")) - cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) - with pytest.warns(match="large parquet file"): - # Need to use `dask_cudf._legacy.io` path - # TODO: Remove outdated `check_file_size` functionality - dask_cudf._legacy.io.read_parquet(fn, check_file_size=1).compute() - - -@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0") def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning @@ -626,7 +607,6 @@ def test_timezone_column(tmpdir): dd.assert_eq(got, expect) -@require_dask_expr() @pytest.mark.skipif( not dask_cudf.backends.PYARROW_GE_15, reason="Requires pyarrow 15", @@ -677,17 +657,8 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.to_parquet is now deprecated"): dask_cudf.io.to_parquet(df, tmpdir) - if dask_cudf.QUERY_PLANNING_ON: - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - else: - with pytest.warns(match="legacy dask_cudf.io.read_parquet"): - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) - with pytest.warns(match="legacy dask_cudf.io.parquet.read_parquet"): - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.parquet.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) diff --git 
a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index 90907f6fb99..7c53b89a883 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.

 import os
 import socket
@@ -14,7 +14,6 @@
 import cudf

 import dask_cudf
-from dask_cudf.tests.utils import QUERY_PLANNING_ON

 moto = pytest.importorskip("moto", minversion="3.1.6")
 boto3 = pytest.importorskip("boto3")
@@ -136,7 +135,7 @@ def test_read_parquet_open_file_options_raises():
         pytest.param(
             "arrow",
             marks=pytest.mark.skipif(
-                not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15,
+                not dask_cudf.backends.PYARROW_GE_15,
                 reason="Not supported",
             ),
         ),
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py
index e35b6411a9d..f4d59334e03 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_text.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.

 import os
@@ -9,10 +9,6 @@
 import cudf

 import dask_cudf
-from dask_cudf.tests.utils import skip_dask_expr
-
-# No dask-expr support for dask<2024.4.0
-pytestmark = skip_dask_expr(lt_version="2024.4.0")

 cur_dir = os.path.dirname(__file__)
 text_file = os.path.join(cur_dir, "data/text/sample.pgn")
@@ -42,7 +38,3 @@ def test_deprecated_api_paths():
     with pytest.warns(match="dask_cudf.io.read_text is now deprecated"):
         df2 = dask_cudf.io.read_text(text_file, delimiter=".")
     dd.assert_eq(df, df2, check_divisions=False)
-
-    with pytest.warns(match="dask_cudf.io.text.read_text is now deprecated"):
-        df2 = dask_cudf.io.text.read_text(text_file, delimiter=".")
-        dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py
index 1caf4e81d8e..eb1d007cc16 100644
--- a/python/dask_cudf/dask_cudf/io/text.py
+++ b/python/dask_cudf/dask_cudf/io/text.py
@@ -1,8 +1,56 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.

-from dask_cudf import _deprecated_api
+import os
+from glob import glob

-read_text = _deprecated_api(
-    "dask_cudf.io.text.read_text",
-    new_api="dask_cudf.read_text",
-)
+import dask.dataframe as dd
+from dask.utils import parse_bytes
+
+import cudf
+
+
+def _read_text(source, **kwargs):
+    # Wrapper for cudf.read_text operation
+    fn, byte_range = source
+    return cudf.read_text(fn, byte_range=byte_range, **kwargs)
+
+
+def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs):
+    if isinstance(chunksize, str):
+        chunksize = parse_bytes(chunksize)
+
+    if isinstance(path, list):
+        filenames = path
+    elif isinstance(path, str):
+        filenames = sorted(glob(path))
+    elif hasattr(path, "__fspath__"):
+        filenames = sorted(glob(path.__fspath__()))
+    else:
+        raise TypeError(f"Path type not understood: {type(path)}")
+
+    if not filenames:
+        msg = f"No files found matching: {path}."
+ raise FileNotFoundError(msg) + + if chunksize and byte_range: + raise ValueError("Cannot specify both chunksize and byte_range.") + + if chunksize: + sources = [] + for fn in filenames: + size = os.path.getsize(fn) + for start in range(0, size, chunksize): + byte_range = ( + start, + chunksize, + ) # specify which chunk of the file we care about + sources.append((fn, byte_range)) + else: + sources = [(fn, byte_range) for fn in filenames] + + return dd.from_map( + _read_text, + sources, + meta=cudf.Series([], dtype="O"), + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 3fbb2aacd2c..c6b01a648eb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -13,7 +13,6 @@ from cudf.testing._utils import does_not_raise import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr ############################################################################# # Datetime Accessor # @@ -112,7 +111,6 @@ def test_categorical_accessor_initialization2(data): dsr.cat -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_basic(data): cat = data.copy() diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7101fb7e00a..31957a106ff 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
import random @@ -9,18 +9,12 @@ import dask from dask import dataframe as dd -from dask.dataframe.core import make_meta as dask_make_meta, meta_nonempty +from dask.dataframe.dispatch import make_meta as dask_make_meta, meta_nonempty from dask.utils import M import cudf import dask_cudf -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) rng = np.random.default_rng(seed=0) @@ -299,37 +293,6 @@ def test_set_index_sorted(): gddf1.set_index("val", sorted=True) -@pytest.mark.parametrize("nelem", [10, 200, 1333]) -@pytest.mark.parametrize("index", [None, "myindex"]) -def test_rearrange_by_divisions(nelem, index): - with dask.config.set(scheduler="single-threaded"): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "x": rng.integers(0, 20, size=nelem), - "y": rng.normal(size=nelem), - "z": rng.choice(["dog", "cat", "bird"], nelem), - } - ) - df["z"] = df["z"].astype("category") - - ddf1 = dd.from_pandas(df, npartitions=4) - gdf1 = dask_cudf.from_cudf( - cudf.DataFrame.from_pandas(df), npartitions=4 - ) - ddf1.index.name = index - gdf1.index.name = index - divisions = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20) - - expect = dd.shuffle.rearrange_by_divisions( - ddf1, "x", divisions=divisions, shuffle_method="tasks" - ) - result = dd.shuffle.rearrange_by_divisions( - gdf1, "x", divisions=divisions, shuffle_method="tasks" - ) - dd.assert_eq(expect, result) - - def test_assign(): rng = np.random.default_rng(seed=0) df = pd.DataFrame( @@ -393,44 +356,6 @@ def test_setitem_scalar_datetime(): np.testing.assert_array_equal(got["z"], df["z"]) -@skip_dask_expr("Not relevant for dask-expr") -@pytest.mark.parametrize( - "func", - [ - lambda: pd.DataFrame( - {"A": rng.random(10), "B": rng.random(10)}, - index=list("abcdefghij"), - ), - lambda: pd.DataFrame( - { - "A": rng.random(10), - "B": list("a" * 10), - "C": pd.Series( - [str(20090101 + i) for i in range(10)], - dtype="datetime64[ns]", - ), - }, - index=list("abcdefghij"), - ), - lambda: pd.Series(list("abcdefghijklmnop")), - lambda: pd.Series( - rng.random(10), - index=pd.Index( - [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]" - ), - ), - ], -) -def test_repr(func): - pdf = func() - gdf = cudf.from_pandas(pdf) - gddf = dd.from_pandas(gdf, npartitions=3, sort=False) - - assert repr(gddf) - if hasattr(pdf, "_repr_html_"): - assert gddf._repr_html_() - - @pytest.mark.skip(reason="datetime indexes not fully supported in cudf") @pytest.mark.parametrize("start", ["1d", "5d", "1w", "12h"]) @pytest.mark.parametrize("stop", ["1d", "3d", "8h"]) @@ -657,20 +582,20 @@ def test_hash_object_dispatch(index): ) # DataFrame - result = dd.core.hash_object_dispatch(obj, index=index) + result = dd.dispatch.hash_object_dispatch(obj, index=index) expected = dask_cudf.backends.hash_object_cudf(obj, index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # Series - result = dd.core.hash_object_dispatch(obj["x"], index=index) + result = dd.dispatch.hash_object_dispatch(obj["x"], index=index) expected = dask_cudf.backends.hash_object_cudf(obj["x"], index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # DataFrame with MultiIndex obj_multi = obj.set_index(["x", "z"], drop=True) - result = dd.core.hash_object_dispatch(obj_multi, index=index) + result = dd.dispatch.hash_object_dispatch(obj_multi, index=index) expected = dask_cudf.backends.hash_object_cudf(obj_multi, index=index) assert isinstance(result, cudf.Series) 
dd.assert_eq(result, expected) @@ -784,7 +709,6 @@ def test_dataframe_set_index(): assert_eq(ddf.compute(), pddf.compute()) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_series_describe(): random.seed(0) sr = cudf.datasets.randomdata(20)["x"] @@ -800,7 +724,6 @@ def test_series_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_dataframe_describe(): random.seed(0) df = cudf.datasets.randomdata(20) @@ -814,7 +737,6 @@ def test_dataframe_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_zero_std_describe(): num = 84886781 df = cudf.DataFrame( @@ -864,7 +786,7 @@ def test_merging_categorical_columns(): ddf_1 = dask_cudf.from_cudf(df_1, npartitions=2) - ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"]) + ddf_1 = ddf_1.categorize(columns=["cat_col"]) df_2 = cudf.DataFrame( {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]} @@ -872,7 +794,7 @@ def test_merging_categorical_columns(): ddf_2 = dask_cudf.from_cudf(df_2, npartitions=2) - ddf_2 = dd.categorical.categorize(ddf_2, columns=["cat_col"]) + ddf_2 = ddf_2.categorize(columns=["cat_col"]) expected = cudf.DataFrame( { @@ -932,14 +854,9 @@ def func(x): result = ds.map_partitions(func, meta=s.values) - if QUERY_PLANNING_ON: - # Check Array and round-tripped DataFrame - dask.array.assert_eq(result, func(s)) - dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) - else: - # Legacy version still carries numpy metadata - # See: https://github.com/dask/dask/issues/11017 - dask.array.assert_eq(result.compute(), func(s)) + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) def test_implicit_array_conversion_cupy_sparse(): @@ -981,7 +898,6 @@ def test_series_isin_error(): ddf.isin([1, 5, "a"]).compute() -@require_dask_expr() def test_to_backend_simplify(): # Check that column projection is not blocked by to_backend with dask.config.set({"dataframe.backend": "pandas"}): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 9bd3b506db0..11ca0c6a783 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
import numpy as np import pandas as pd @@ -13,12 +13,7 @@ from cudf.testing._utils import expect_warning_if import dask_cudf -from dask_cudf._legacy.groupby import OPTIMIZED_AGGS, _aggs_optimized -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - xfail_dask_expr, -) +from dask_cudf._expr.groupby import OPTIMIZED_AGGS, _aggs_optimized def assert_cudf_groupby_layers(ddf): @@ -78,18 +73,12 @@ def test_groupby_basic(series, aggregation, pdf): expect = getattr(gdf_grouped, aggregation)() actual = getattr(ddf_grouped, aggregation)() - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) if not series: expect = gdf_grouped.agg({"x": aggregation}) actual = ddf_grouped.agg({"x": aggregation}) - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) @@ -134,13 +123,6 @@ def test_groupby_agg(func, aggregation, pdf): check_dtype = aggregation != "count" - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - - # groupby.agg should add an explicit getitem layer - # to improve/enable column projection - assert hlg_layer(actual.dask, "getitem") - dd.assert_eq(expect, actual, check_names=False, check_dtype=check_dtype) @@ -556,20 +538,13 @@ def test_groupby_categorical_key(): True, pytest.param( False, - marks=xfail_dask_expr("as_index not supported in dask-expr"), - ), - ], -) -@pytest.mark.parametrize( - "fused", - [ - True, - pytest.param( - False, - marks=require_dask_expr("Not supported by legacy API"), + marks=pytest.mark.xfail( + reason="as_index not supported in dask-expr" + ), ), ], ) +@pytest.mark.parametrize("fused", [True, False]) @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) @@ -590,19 +565,16 @@ def test_groupby_agg_params( "c": ["mean", "std", "var"], } - fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {} + fused_kwarg = {"fused": fused} split_kwargs = {"split_every": split_every, "split_out": split_out} if split_out == "use_dask_default": split_kwargs.pop("split_out") # Avoid using as_index when query-planning is enabled - if QUERY_PLANNING_ON: - with pytest.warns(FutureWarning, match="argument is now deprecated"): - # Should warn when `as_index` is used - ddf.groupby(["name", "a"], sort=False, as_index=as_index) - maybe_as_index = {"as_index": as_index} if as_index is False else {} - else: - maybe_as_index = {"as_index": as_index} + with pytest.warns(FutureWarning, match="argument is now deprecated"): + # Should warn when `as_index` is used + ddf.groupby(["name", "a"], sort=False, as_index=as_index) + maybe_as_index = {"as_index": as_index} if as_index is False else {} # Check `sort=True` behavior if split_out == 1: @@ -671,7 +643,6 @@ def test_groupby_agg_params( dd.assert_eq(gf, pf) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] ) @@ -711,7 +682,6 @@ def test_is_supported(arg, supported): assert _aggs_optimized(arg, OPTIMIZED_AGGS) is supported -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_groupby_unique_lists(): df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]}) gdf = cudf.from_pandas(df) @@ -758,7 +728,7 @@ def test_groupby_first_last(data, agg): ) -@xfail_dask_expr("Co-alignment check fails in dask-expr") 
+@pytest.mark.xfail(reason="Co-alignment check fails in dask-expr") def test_groupby_with_list_of_series(): df = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}) gdf = dask_cudf.from_cudf(df, npartitions=2) @@ -773,7 +743,6 @@ def test_groupby_with_list_of_series(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "func", [ @@ -833,7 +802,7 @@ def test_groupby_all_columns(func): expect = func(ddf) actual = func(gddf) - dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON) + dd.assert_eq(expect, actual, check_names=False) def test_groupby_shuffle(): @@ -870,15 +839,3 @@ def test_groupby_shuffle(): # NOTE: `shuffle_method=True` should be default got = gddf.groupby("a", sort=False).agg(spec, split_out=2) dd.assert_eq(expect, got.compute().sort_index()) - - if not QUERY_PLANNING_ON: - # Sorted aggregation fails with split_out>1 when shuffle is False - # (sort=True, split_out=2, shuffle_method=False) - with pytest.raises(ValueError): - gddf.groupby("a", sort=True).agg( - spec, shuffle_method=False, split_out=2 - ) - - # Check shuffle kwarg deprecation - with pytest.warns(match="'shuffle' keyword is deprecated"): - gddf.groupby("a", sort=True).agg(spec, shuffle=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py index 0b7c7855e07..2d05345bc4a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_onehot.py +++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import pandas as pd import pytest @@ -8,12 +8,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr - -# No dask-expr support -pytestmark = xfail_dask_expr( - "Newer dask version needed", lt_version="2024.5.0" -) def test_get_dummies_cat(): diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 02c815427f3..68d6e72660e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -10,7 +10,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr @pytest.mark.parametrize("ascending", [True, False]) @@ -67,7 +66,6 @@ def test_sort_repartition(): dd.assert_eq(len(new_ddf), len(ddf)) -@xfail_dask_expr("missing null support", lt_version="2024.5.1") @pytest.mark.parametrize("na_position", ["first", "last"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index b44b3f939e7..ef6765f39d1 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -1,22 +1,12 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
import numpy as np import pandas as pd -import pytest -from packaging.version import Version -import dask import dask.dataframe as dd import cudf -from dask_cudf import QUERY_PLANNING_ON - -if QUERY_PLANNING_ON: - DASK_VERSION = Version(dask.__version__) -else: - DASK_VERSION = None - def _make_random_frame(nelem, npartitions=2, include_na=False): rng = np.random.default_rng(seed=0) @@ -30,26 +20,3 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): gdf = cudf.DataFrame.from_pandas(df) dgf = dd.from_pandas(gdf, npartitions=npartitions) return df, dgf - - -_default_reason = "Not compatible with dask-expr" - - -def skip_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - skip = QUERY_PLANNING_ON - return pytest.mark.skipif(skip, reason=reason) - - -def xfail_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - xfail = QUERY_PLANNING_ON - return pytest.mark.xfail(xfail, reason=reason) - - -def require_dask_expr(reason="requires dask-expr"): - return pytest.mark.skipif(not QUERY_PLANNING_ON, reason=reason) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index a8cb696d7f6..b88816a3d47 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -39,10 +39,10 @@ classifiers = [ ] [project.entry-points."dask.dataframe.backends"] -cudf = "dask_cudf.backends:CudfBackendEntrypoint" +cudf = "dask_cudf.backends:LegacyCudfBackendEntrypoint" [project.entry-points."dask_expr.dataframe.backends"] -cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" +cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ @@ -102,8 +102,5 @@ filterwarnings = [ # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask", - # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437 - # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False` - "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning", ] xfail_strict = true From dc99d2f9bc602e40c7bae894b6759e30a8efdddd Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 Jan 2025 13:00:30 +0000 Subject: [PATCH 09/26] Introduce some simple benchmarks for rolling window aggregations (#17613) Previously we did not have any benchmarks for rolling aggregations. Introduce some, so we can measure the effects of any performance improvements we might make. 
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - MithunR (https://github.com/mythrocks) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17613 --- cpp/benchmarks/CMakeLists.txt | 7 +- .../rolling/grouped_rolling_sum.cpp | 70 +++++++++ cpp/benchmarks/rolling/rolling_sum.cpp | 134 ++++++++++++++++++ 3 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 cpp/benchmarks/rolling/grouped_rolling_sum.cpp create mode 100644 cpp/benchmarks/rolling/rolling_sum.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 749e1b628ee..0ff712c1c77 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -425,6 +425,11 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp) # --------------------------------------------------------------------------------- ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp) +# ################################################################################################## +# * rolling benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp) + add_custom_target( run_benchmarks DEPENDS CUDF_BENCHMARKS diff --git a/cpp/benchmarks/rolling/grouped_rolling_sum.cpp b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp new file mode 100644 index 00000000000..04afe5ac661 --- /dev/null +++ b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include + +#include + +template +void bench_row_grouped_rolling_sum(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const preceding_size = static_cast(state.get_int64("preceding_size")); + auto const following_size = static_cast(state.get_int64("following_size")); + auto const min_periods = static_cast(state.get_int64("min_periods")); + + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + auto keys = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + return cudf::sort(cudf::table_view{{keys->view()}}); + }(); + data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + auto req = cudf::make_sum_aggregation(); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = cudf::grouped_rolling_window( + keys->view(), vals->view(), preceding_size, following_size, min_periods, *req); + }); + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_grouped_rolling_sum") + .add_int64_power_of_two_axis("num_rows", {14, 28}) + .add_int64_axis("preceding_size", {1, 10}) + .add_int64_axis("following_size", {2}) + .add_int64_axis("min_periods", {1}) + .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000}); diff --git a/cpp/benchmarks/rolling/rolling_sum.cpp b/cpp/benchmarks/rolling/rolling_sum.cpp new file mode 100644 index 00000000000..af9ecd6a26f --- /dev/null +++ b/cpp/benchmarks/rolling/rolling_sum.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +template +void bench_row_fixed_rolling_sum(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const preceding_size = static_cast(state.get_int64("preceding_size")); + auto const following_size = static_cast(state.get_int64("following_size")); + auto const min_periods = static_cast(state.get_int64("min_periods")); + + data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + auto req = cudf::make_sum_aggregation(); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = + cudf::rolling_window(vals->view(), preceding_size, following_size, min_periods, *req); + }); + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +template +void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const preceding_size = static_cast(state.get_int64("preceding_size")); + auto const following_size = static_cast(state.get_int64("following_size")); + + auto vals = [&]() { + data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto preceding = [&]() { + auto data = std::vector(num_rows); + auto it = thrust::make_counting_iterator(0); + std::transform(it, it + num_rows, data.begin(), [num_rows, preceding_size](auto i) { + return std::min(i + 1, std::max(preceding_size, i + 1 - num_rows)); + }); + auto buf = rmm::device_buffer( + data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream()); + cudf::get_default_stream().synchronize(); + return std::make_unique(cudf::data_type(cudf::type_to_id()), + num_rows, + std::move(buf), + rmm::device_buffer{}, + 0); + }(); + + auto following = [&]() { + auto data = std::vector(num_rows); + auto it = thrust::make_counting_iterator(0); + std::transform(it, it + num_rows, data.begin(), [num_rows, following_size](auto i) { + return std::max(-i - 1, std::min(following_size, num_rows - i - 1)); + }); + auto buf = rmm::device_buffer( + data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream()); + cudf::get_default_stream().synchronize(); + return std::make_unique(cudf::data_type(cudf::type_to_id()), + num_rows, + std::move(buf), + rmm::device_buffer{}, + 0); + }(); + + auto req = cudf::make_sum_aggregation(); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = + cudf::rolling_window(vals->view(), preceding->view(), following->view(), 1, *req); + }); + auto 
const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_fixed_rolling_sum") + .add_int64_power_of_two_axis("num_rows", {14, 22, 28}) + .add_int64_axis("preceding_size", {1, 10, 100}) + .add_int64_axis("following_size", {2}) + .add_int64_axis("min_periods", {1, 20}); + +NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_variable_rolling_sum") + .add_int64_power_of_two_axis("num_rows", {14, 22, 28}) + .add_int64_axis("preceding_size", {10, 100}) + .add_int64_axis("following_size", {2}); From d05b78b13e3cce55e1691de86b0c6020d4f1b0cd Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:54:22 -0500 Subject: [PATCH 10/26] Fix build metrics report format with long placehold filenames (#17679) Truncates filenames that appear as multiple `placehold_placehold_...` in the Build Metrics Report. Example shown here: https://downloads.rapids.ai/ci/cudf/pull-request/17669/0710ad6/cuda12_x86_64.ninja_log.html (requires VPN). Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17679 --- cpp/scripts/sort_ninja_log.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index 42f84e4d0c7..e111367d191 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,8 +1,9 @@ # -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # import argparse import os +import re import sys import xml.etree.ElementTree as ET from pathlib import Path @@ -144,6 +145,16 @@ def format_file_size(input_size): return file_size_str +def replace_placeholder_patterns(input_string: str) -> str: + pattern = r'(_h_env_placehold)[_placehold]+' + return re.sub(pattern, r'\1...', input_string) + + +# adjust name for display +def format_file_name(name: str) -> str: + return replace_placeholder_patterns(name) + + # Output chart results in HTML format # Builds a standalone html file with no javascript or styles def output_html(entries, sorted_list, cmp_entries, args): @@ -223,7 +234,8 @@ def output_html(entries, sorted_list, cmp_entries, args): print("", end="") # use a slightly smaller, fixed-width font @@ -265,7 +277,8 @@ def output_html(entries, sorted_list, cmp_entries, args): file_size_str = format_file_size(file_size) # output entry row - print("", name, "", sep="", end="") + display_name = format_file_name(name) + print("", display_name, "", sep="", end="") print("", build_time_str, "", sep="", end="") print("", file_size_str, "", sep="", end="") # output diff column From a38ce0a2447e9bca15f3a904c54fe1eba27e5940 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:55:09 -0500 Subject: [PATCH 11/26] Remove pragma GCC diagnostic from source files (#17637) Removes the `#pragma GCC diagnostic` from several source files. These do not seem to be necessary and could suppress useful compile warnings.
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17637 --- cpp/src/join/mixed_join_kernel.cuh | 4 +--- cpp/src/join/mixed_join_kernels_semi.cu | 4 +--- cpp/src/join/mixed_join_size_kernel.cuh | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 368b1fba870..4565626edad 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,8 +37,6 @@ namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) mixed_join(table_device_view left_table, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index a4ec97af235..4c063b6202e 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,8 +30,6 @@ namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) mixed_join_semi(table_device_view left_table, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 98170ed719a..869d05ce4d3 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,8 +34,6 @@ namespace cudf { namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) compute_mixed_join_output_size(table_device_view left_table, From da316860211281807e39fadb2a543bcdd6f56abb Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 Jan 2025 16:25:14 +0000 Subject: [PATCH 12/26] Skip polars test that can generate timezones that chrono_tz doesn't know (#17694) On Ubuntu 20.04, the tzdata package contains a bunch of symlinks for obsolete timezone names. However, the chrono_tz package that polars uses doesn't read /usr/share/zoneinfo, instead packaging the current zoneinfo database from IANA. Consequently, when this hypothesis-generated test runs and generates timezones from the available zoneinfo-reported timezones, we can get an error from polars that the requested timezone is unknown. Since this is random, just skip it, rather than xfailing. 
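For context, the mismatch is easy to observe directly. The sketch below is illustrative only (it is not part of this change) and assumes a Python environment with polars plus the standard-library `zoneinfo` module; it prints any system-reported timezone name that polars' chrono_tz backend rejects:

```python
from datetime import datetime
import zoneinfo

import polars as pl

# zoneinfo reflects /usr/share/zoneinfo, which on Ubuntu 20.04 still
# exposes obsolete alias names via symlinks; chrono_tz ships its own
# copy of the current IANA database and may not know those aliases.
s = pl.Series([datetime(2020, 1, 1)]).dt.replace_time_zone("UTC")
for tz in sorted(zoneinfo.available_timezones()):
    try:
        s.dt.convert_time_zone(tz)
    except Exception as exc:  # unknown names raise here
        print(f"{tz}: {exc}")
```
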
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Murray (https://github.com/Matt711) - Peter Andreas Entschev (https://github.com/pentschev) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17694 --- .../cudf_polars/cudf_polars/testing/plugin.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 7a759eea2e9..87628242838 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Plugin for running polars test suite setting GPU engine as default.""" @@ -174,6 +174,19 @@ def pytest_configure(config: pytest.Config) -> None: } +TESTS_TO_SKIP: Mapping[str, str] = { + # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks + # for obsolete timezone names. However, the chrono_tz package that + # polars uses doesn't read /usr/share/zoneinfo, instead packaging + # the current zoneinfo database from IANA. Consequently, when this + # hypothesis-generated test runs and generates timezones from the + # available zoneinfo-reported timezones, we can get an error from + # polars that the requested timezone is unknown. + # Since this is random, just skip it, rather than xfailing. + "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names", +} + + def pytest_collection_modifyitems( session: pytest.Session, config: pytest.Config, items: list[pytest.Item] ) -> None: @@ -182,5 +195,7 @@ def pytest_collection_modifyitems( # Don't xfail tests if running without fallback return for item in items: - if item.nodeid in EXPECTED_FAILURES: + if item.nodeid in TESTS_TO_SKIP: + item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid])) + elif item.nodeid in EXPECTED_FAILURES: item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) From f017f869829cb05694d195aab0f118357c0dbbd8 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 8 Jan 2025 09:30:40 -0800 Subject: [PATCH 13/26] Control pinned memory use with environment variables (#17657) Adds `LIBCUDF_KERNEL_PINNED_COPY_THRESHOLD` and `LIBCUDF_ALLOCATE_HOST_AS_PINNED_THRESHOLD` environment variables to set the pinned memory optimizations' thresholds. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17657 --- cpp/src/utilities/host_memory.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 73c4567d3a4..94d27d976c3 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "io/utilities/getenv_or.hpp" + #include #include #include @@ -277,7 +279,7 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts) CUDF_EXPORT auto& kernel_pinned_copy_threshold() { // use cudaMemcpyAsync for all pinned copies - static std::atomic threshold = 0; + static std::atomic threshold = getenv_or("LIBCUDF_KERNEL_PINNED_COPY_THRESHOLD", 0); return threshold; } @@ -291,7 +293,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 0; + static std::atomic threshold = getenv_or("LIBCUDF_ALLOCATE_HOST_AS_PINNED_THRESHOLD", 0); return threshold; } From f1cb88df8eb7862a82969dfdfd746886198a9b22 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:41:20 -0800 Subject: [PATCH 14/26] Define cudf repr methods on the Column (#17675) Refactors cudf Python objects' repr handling to define the core conversion of "cleaning" nulls at the column level and then rolling up the conversions at the `Frame` and its subclasses level. Notable positive changes: * `repr(cudf.Series)` no longer deep copies * Fixes a bug when `repr(cudf.Series)` with a timedelta type to better match pandas (adjusted unit tests accordingly) Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17675 --- python/cudf/cudf/core/_base_index.py | 4 +- python/cudf/cudf/core/column/column.py | 16 +++++- python/cudf/cudf/core/column/datetime.py | 4 +- python/cudf/cudf/core/column/lists.py | 13 ++++- python/cudf/cudf/core/column/struct.py | 13 ++++- python/cudf/cudf/core/column/timedelta.py | 2 + python/cudf/cudf/core/dataframe.py | 53 +++---------------- python/cudf/cudf/core/frame.py | 9 +++- python/cudf/cudf/core/index.py | 30 ++--------- python/cudf/cudf/core/indexed_frame.py | 6 +++ python/cudf/cudf/core/multiindex.py | 19 +++---- python/cudf/cudf/core/series.py | 29 ++--------- python/cudf/cudf/tests/test_repr.py | 62 +++++++++++------------ 13 files changed, 117 insertions(+), 143 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index c2f3c782d10..2806a1f6c23 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -350,7 +350,7 @@ def names(self, values): self.name = values[0] - def _clean_nulls_from_index(self): + def _pandas_repr_compatible(self): """ Convert all na values(if any) in Index object to `` as a preprocessing step to `__repr__` methods. diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 24b657f1c32..ef815e44d9d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -77,6 +77,7 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.strings import StringColumn if PANDAS_GE_210: NumpyExtensionArray = pd.arrays.NumpyExtensionArray @@ -92,6 +93,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } + _PANDAS_NA_REPR = str(pd.NA) + def data_array_view( self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": @@ -176,6 +179,17 @@ def __repr__(self): f"dtype: {self.dtype}" ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + if self.has_nulls(): + return self.astype("str").fillna(self._PANDAS_NA_REPR) + return self + def to_pandas( self, *, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b6a4122ebb9..80551e33115 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -212,6 +212,8 @@ class DatetimeColumn(column.ColumnBase): "__rsub__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6283e498842..9c5041df521 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -28,6 +28,7 @@ from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class ListColumn(ColumnBase): @@ -67,6 +68,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + # TODO: handle if self.has_nulls(): case + return self + @cached_property def memory_usage(self): n = super().memory_usage diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index ba765b50729..052a68cec98 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from functools import cached_property @@ -18,6 +18,7 @@ from cudf._typing import Dtype from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class StructColumn(ColumnBase): @@ -51,6 +52,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. 
+ + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + # TODO: handle if self.has_nulls(): case + return self + @staticmethod def _validate_dtype_instance(dtype: StructDtype) -> StructDtype: # IntervalDtype is a subclass of StructDtype, so compare types exactly diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 749ab8e837a..302178ea277 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -81,6 +81,8 @@ class TimeDeltaColumn(ColumnBase): "__rfloordiv__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b2121511a14..40d36a6ff56 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -1894,7 +1894,7 @@ def astype( dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) - def _clean_renderable_dataframe(self, output): + def _clean_renderable_dataframe(self, output: Self) -> str: """ This method takes in partial/preprocessed dataframe and returns correct representation of it with correct @@ -1929,41 +1929,7 @@ def _clean_renderable_dataframe(self, output): ) return "\n".join(lines) - def _clean_nulls_from_dataframe(self, df): - """ - This function converts all ``null`` values to ```` for - representation as a string in `__repr__`. - - Since we utilize Pandas `__repr__` at all places in our code - for formatting purposes, we convert columns to `str` dtype for - filling with `` values. - """ - for col in df._data: - if isinstance( - df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype) - ): - # TODO we need to handle this - pass - elif df._data[col].has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance( - df._data[col], - ( - cudf.core.column.DatetimeColumn, - cudf.core.column.TimeDeltaColumn, - ), - ) - else str(cudf.NA) - ) - - df[col] = df._data[col].astype("str").fillna(fill_value) - else: - df[col] = df._data[col] - - return df - - def _get_renderable_dataframe(self): + def _get_renderable_dataframe(self) -> Self: """ Takes rows and columns from pandas settings or estimation from size. pulls quadrants based off of some known parameters then style for @@ -1971,9 +1937,9 @@ def _get_renderable_dataframe(self): for printing with the dataframe. 
""" max_rows = pd.options.display.max_rows - nrows = np.max([len(self) if max_rows is None else max_rows, 1]) - if pd.options.display.max_rows == 0: - nrows = len(self) + if max_rows in {0, None}: + max_rows = len(self) + nrows = max(max_rows, 1) ncols = ( pd.options.display.max_columns if pd.options.display.max_columns @@ -1981,7 +1947,7 @@ def _get_renderable_dataframe(self): ) if len(self) <= nrows and self._num_columns <= ncols: - output = self.copy(deep=False) + output = self elif self.empty and len(self.index) > 0: max_seq_items = pd.options.display.max_seq_items # In case of Empty DataFrame with index, Pandas prints @@ -2041,10 +2007,7 @@ def _get_renderable_dataframe(self): lower = cudf.concat([lower_left, lower_right], axis=1) output = cudf.concat([upper, lower]) - output = self._clean_nulls_from_dataframe(output) - output.index = output.index._clean_nulls_from_index() - - return output + return output._pandas_repr_compatible() @_performance_tracking def __repr__(self): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8f45c6f0115..abf9f7b3686 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -820,6 +820,13 @@ def fillna( inplace=inplace, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + columns = (col._prep_pandas_compat_repr() for col in self._columns) + return self._from_data_like_self( + self._data._from_columns_like_self(columns, verify=False) + ) + @_performance_tracking def _drop_column( self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 85be8d21d27..54635b162bc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -339,7 +339,7 @@ def _values(self) -> ColumnBase: else: return column.column_empty(0, dtype=self.dtype) - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self def _is_numeric(self) -> bool: @@ -1127,15 +1127,9 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: out.name = name return out - @classmethod @_performance_tracking - def _from_data_like_self( - cls, data: MutableMapping, name: Any = no_default - ) -> Self: - out = _index_from_data(data, name) - if name is not no_default: - out.name = name - return out + def _from_data_like_self(self, data: MutableMapping) -> Self: + return _index_from_data(data, self.name) @classmethod @_performance_tracking @@ -1494,7 +1488,7 @@ def __repr__(self) -> str: if isinstance(self._values, StringColumn): output = repr(self.to_pandas(nullable=True)) else: - output = repr(self._clean_nulls_from_index().to_pandas()) + output = repr(self._pandas_repr_compatible().to_pandas()) # We should remove all the single quotes # from the output due to the type-cast to # object dtype happening above. 
@@ -1650,20 +1644,6 @@ def __contains__(self, item) -> bool: hash(item) return item in self._column - def _clean_nulls_from_index(self) -> Index: - if self._values.has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance(self, (DatetimeIndex, TimedeltaIndex)) - else str(cudf.NA) - ) - return Index._from_column( - self._column.astype("str").fillna(fill_value), - name=self.name, - ) - - return self - def any(self) -> bool: return self._column.any() @@ -3615,7 +3595,7 @@ def _is_interval(self) -> bool: def _is_boolean(self) -> bool: return False - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self @property diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e9ed74f804b..c779e1ebe97 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4410,6 +4410,12 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): index_names=self.index.names if keep_index else None, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + result = super()._pandas_repr_compatible() + result.index = self.index._pandas_repr_compatible() + return result + def take(self, indices, axis=0): """Return a new frame containing the rows specified by *indices*. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1e613e49ffc..e7efd01ca85 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -361,6 +361,13 @@ def _from_data( name=name, ) + @_performance_tracking + def _from_data_like_self(self, data: MutableMapping) -> Self: + mi = type(self)._from_data(data, name=self.name) + if mi.nlevels == self.nlevels: + mi.names = self.names + return mi + @classmethod def _simple_new( cls, @@ -1753,16 +1760,6 @@ def nunique(self, dropna: bool = True) -> int: mi = self.dropna(how="all") if dropna else self return len(mi.unique()) - def _clean_nulls_from_index(self) -> Self: - """ - Convert all na values(if any) in MultiIndex object - to `` as a preprocessing step to `__repr__` methods. 
- """ - index_df = self.to_frame(index=False, name=list(range(self.nlevels))) - return MultiIndex.from_frame( - index_df._clean_nulls_from_dataframe(index_df), names=self.names - ) - @_performance_tracking def memory_usage(self, deep: bool = False) -> int: usage = sum(col.memory_usage for col in self._columns) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 49c2c8cf387..3b047ee5ed4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1449,35 +1449,16 @@ def __repr__(self): warnings.simplefilter("ignore", FutureWarning) preprocess = cudf.concat([top, bottom]) else: - preprocess = self.copy() - preprocess.index = preprocess.index._clean_nulls_from_index() - if ( - preprocess.nullable - and not isinstance( - preprocess.dtype, - ( - cudf.CategoricalDtype, - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype, - ), - ) - ) or preprocess.dtype.kind == "m": - fill_value = ( - str(cudf.NaT) - if preprocess.dtype.kind in "mM" - else str(cudf.NA) - ) - output = repr( - preprocess.astype("str").fillna(fill_value).to_pandas() - ) - elif isinstance(preprocess.dtype, cudf.CategoricalDtype): + preprocess = self + if isinstance(preprocess.dtype, cudf.CategoricalDtype): min_rows = ( height if pd.get_option("display.min_rows") == 0 else pd.get_option("display.min_rows") ) show_dimensions = pd.get_option("display.show_dimensions") + preprocess = preprocess.copy(deep=False) + preprocess.index = preprocess.index._pandas_repr_compatible() if preprocess.dtype.categories.dtype.kind == "f": pd_series = ( preprocess.astype("str") @@ -1502,7 +1483,7 @@ def __repr__(self): na_rep=str(cudf.NA), ) else: - output = repr(preprocess.to_pandas()) + output = repr(preprocess._pandas_repr_compatible().to_pandas()) lines = output.split("\n") if isinstance(preprocess.dtype, cudf.CategoricalDtype): diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index bf0c97adb00..2cb742727cc 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
import textwrap @@ -618,9 +618,9 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), textwrap.dedent( """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 0 days 00:00:00.003000000 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:00.000200 + 2 0 days 00:00:00.003000 dtype: timedelta64[ns] """ ), @@ -710,12 +710,12 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.012 - 1 0 days 00:00:00.012 - 2 0 days 00:00:00.022 - 3 0 days 00:00:00.343 - 4 0 days 01:12:33.534 - 5 0 days 00:07:15.342 + 0 0 days 00:00:00.012000 + 1 0 days 00:00:00.012000 + 2 0 days 00:00:00.022000 + 3 0 days 00:00:00.343000 + 4 0 days 01:12:33.534000 + 5 0 days 00:07:15.342000 dtype: timedelta64[ms] """ ), @@ -745,13 +745,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.001 - 1 0 days 00:00:01.132 - 2 0 days 06:27:03.231 - 3 0 days 00:00:00.233 - 4 0 days 00:00:00 - 5 0 days 00:00:00.332 - 6 0 days 00:00:00.323 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:01.132000 + 2 0 days 06:27:03.231000 + 3 0 days 00:00:00.233000 + 4 0 days 00:00:00 + 5 0 days 00:00:00.332000 + 6 0 days 00:00:00.323000 dtype: timedelta64[ms] """ ), @@ -771,13 +771,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 dtype: timedelta64[ms] """ ), @@ -824,13 +824,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 Name: abc, dtype: timedelta64[ms] """ ), From 2c385c456d71fddb74298871b0918b0fb7ad72f4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:58:58 -0800 Subject: [PATCH 15/26] Convert cudf.Scalar usage to pylibcudf and pyarrow usage (#17686) A lot of `cudf.Scalar` usage is to eventually end up with a device scalar object (`pylibcudf.Scalar`) to pass to a pylibcudf routine. The conversion logic to get there can be achieved by converting to a `pyarrow.Scalar` and using `pylibcudf.interop.from_arrow`. This way we offload a lot of scalar-conversion-logic in `cudf.Scalar` to `pyarrow.Scalar` which can further be converted using the interop method. This PR just tackles some straightforward cases of the above. 
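Concretely, the pattern used throughout this change looks like the following (an illustrative sketch of the two steps, not a line taken verbatim from the diff):

```python
import pyarrow as pa
import pylibcudf as plc

# Step 1: express the host value as a pyarrow scalar, letting pyarrow
# handle the Python-value-to-Arrow-type conversion logic.
host_scalar = pa.scalar("sep", type=pa.string())

# Step 2: convert it to a device scalar that pylibcudf routines accept.
device_scalar = plc.interop.from_arrow(host_scalar)
```
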
Another PR will tackle the more involved cases Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17686 --- python/cudf/cudf/api/types.py | 4 +- python/cudf/cudf/core/byte_pair_encoding.py | 5 +- python/cudf/cudf/core/column/column.py | 22 ++- python/cudf/cudf/core/column/datetime.py | 4 +- python/cudf/cudf/core/column/lists.py | 14 +- python/cudf/cudf/core/column/numerical.py | 11 +- python/cudf/cudf/core/column/string.py | 190 ++++++++++--------- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/groupby/groupby.py | 6 +- python/cudf/cudf/core/index.py | 3 +- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/core/tokenize_vocabulary.py | 5 +- python/cudf/cudf/core/window/rolling.py | 11 +- python/cudf/cudf/tests/test_list.py | 21 +- 15 files changed, 167 insertions(+), 139 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 9c436dfad18..cad4b1aa72c 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. """Define common type operations.""" @@ -13,6 +13,7 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa from pandas.api import types as pd_types import cudf @@ -144,6 +145,7 @@ def is_scalar(val): cudf.Scalar, cudf._lib.scalar.DeviceScalar, cudf.core.tools.datetimes.DateOffset, + pa.Scalar, ), ) or ( pd_types.is_scalar(val) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index b49f5154697..0fe47255368 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -53,7 +53,6 @@ def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: 1 this is it dtype: object """ - sep = cudf.Scalar(separator, dtype="str") return cudf.Series._from_column( - text._column.byte_pair_encoding(self.merge_pairs, sep) + text._column.byte_pair_encoding(self.merge_pairs, separator) ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ef815e44d9d..e23ca810065 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -253,8 +253,12 @@ def find_and_replace( def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self: plc_column = plc.replace.clamp( self.to_pylibcudf(mode="read"), - cudf.Scalar(lo, self.dtype).device_value.c_value, - cudf.Scalar(hi, self.dtype).device_value.c_value, + plc.interop.from_arrow( + pa.scalar(lo, type=cudf_dtype_to_pa_type(self.dtype)) + ), + plc.interop.from_arrow( + pa.scalar(hi, type=cudf_dtype_to_pa_type(self.dtype)) + ), ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] @@ -1029,7 +1033,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: # https://github.com/rapidsai/cudf/issues/14515 by # providing a mode in which cudf::contains does not mask # the result. 
- result = result.fillna(cudf.Scalar(rhs.null_count > 0)) + result = result.fillna(rhs.null_count > 0) return result def as_mask(self) -> Buffer: @@ -1995,12 +1999,12 @@ def as_column( column = Column.from_pylibcudf( plc.filling.sequence( len(arbitrary), - cudf.Scalar( - arbitrary.start, dtype=np.dtype(np.int64) - ).device_value.c_value, - cudf.Scalar( - arbitrary.step, dtype=np.dtype(np.int64) - ).device_value.c_value, + plc.interop.from_arrow( + pa.scalar(arbitrary.start, type=pa.int64()) + ), + plc.interop.from_arrow( + pa.scalar(arbitrary.step, type=pa.int64()) + ), ) ) if cudf.get_option("default_integer_bitwidth") and dtype is None: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 80551e33115..1bde7d27700 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -353,8 +353,8 @@ def is_year_end(self) -> ColumnBase: day_of_year = self.day_of_year leap_dates = self.is_leap_year - leap = day_of_year == cudf.Scalar(366) - non_leap = day_of_year == cudf.Scalar(365) + leap = day_of_year == 366 + non_leap = day_of_year == 365 return leap.copy_if_else(non_leap, leap_dates).fillna(False) @property diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 9c5041df521..6fc2b5d4ca2 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -285,7 +285,7 @@ def as_string_column(self) -> cudf.core.column.StringColumn: with acquire_spill_lock(): plc_column = plc.strings.convert.convert_lists.format_list_column( lc.to_pylibcudf(mode="read"), - cudf.Scalar("None").device_value.c_value, + plc.interop.from_arrow(pa.scalar("None")), separators.to_pylibcudf(mode="read"), ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] @@ -391,20 +391,20 @@ def extract_element_column(self, index: ColumnBase) -> ColumnBase: ) @acquire_spill_lock() - def contains_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + def contains_scalar(self, search_key: pa.Scalar) -> ColumnBase: return type(self).from_pylibcudf( plc.lists.contains( self.to_pylibcudf(mode="read"), - search_key.device_value.c_value, + plc.interop.from_arrow(search_key), ) ) @acquire_spill_lock() - def index_of_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + def index_of_scalar(self, search_key: pa.Scalar) -> ColumnBase: return type(self).from_pylibcudf( plc.lists.index_of( self.to_pylibcudf(mode="read"), - search_key.device_value.c_value, + plc.interop.from_arrow(search_key), plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -569,7 +569,7 @@ def contains(self, search_key: ScalarLike) -> ParentType: dtype: bool """ return self._return_or_inplace( - self._column.contains_scalar(cudf.Scalar(search_key)) + self._column.contains_scalar(pa.scalar(search_key)) ) def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: @@ -618,7 +618,7 @@ def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: """ if is_scalar(search_key): - result = self._column.index_of_scalar(cudf.Scalar(search_key)) + result = self._column.index_of_scalar(pa.scalar(search_key)) else: result = self._column.index_of_column(as_column(search_key)) return self._return_or_inplace(result) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 8fe5299fcdd..70103745926 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA 
CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from numba.np import numpy_support from typing_extensions import Self @@ -382,12 +383,8 @@ def as_string_column(self) -> cudf.core.column.StringColumn: elif self.dtype.kind == "b": conv_func = functools.partial( plc.strings.convert.convert_booleans.from_booleans, - true_string=cudf.Scalar( - "True", dtype="str" - ).device_value.c_value, - false_string=cudf.Scalar( - "False", dtype="str" - ).device_value.c_value, + true_string=plc.interop.from_arrow(pa.scalar("True")), + false_string=plc.interop.from_arrow(pa.scalar("False")), ) elif self.dtype.kind in {"i", "u"}: conv_func = plc.strings.convert.convert_integers.from_integers diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fcdcb789f23..20eded9a27f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -302,8 +302,10 @@ def cat(self, others=None, sep=None, na_rep=None): with acquire_spill_lock(): plc_column = plc.strings.combine.join_strings( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(na_rep, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow( + pa.scalar(na_rep, type=pa.string()) + ), ) data = Column.from_pylibcudf(plc_column) else: @@ -359,8 +361,10 @@ def cat(self, others=None, sep=None, na_rep=None): ) ] ), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(na_rep, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow( + pa.scalar(na_rep, type=pa.string()) + ), ) data = Column.from_pylibcudf(plc_column) @@ -522,11 +526,9 @@ def join( with acquire_spill_lock(): plc_column = plc.strings.combine.join_list_elements( strings_column.to_pylibcudf(mode="read"), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(string_na_rep).device_value.c_value, - cudf._lib.scalar.DeviceScalar( - "", cudf.dtype("object") - ).c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow(pa.scalar(string_na_rep)), + plc.interop.from_arrow(pa.scalar("")), plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -547,8 +549,8 @@ def join( plc_column = plc.strings.combine.join_list_elements( strings_column.to_pylibcudf(mode="read"), sep_column.to_pylibcudf(mode="read"), - cudf.Scalar(sep_na_rep).device_value.c_value, - cudf.Scalar(string_na_rep).device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep_na_rep)), + plc.interop.from_arrow(pa.scalar(string_na_rep)), plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -800,14 +802,14 @@ def contains( else: if case is False: input_column = self.lower()._column # type: ignore[union-attr] - plc_pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore[union-attr] + pat_normed = pat.lower() # type: ignore[union-attr] else: input_column = self._column - plc_pat = cudf.Scalar(pat, dtype="str") + pat_normed = pat with acquire_spill_lock(): plc_result = plc.strings.find.contains( input_column.to_pylibcudf(mode="read"), - plc_pat.device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat_normed)), ) result_col = Column.from_pylibcudf(plc_result) else: @@ -892,8 +894,8 
@@ def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: with acquire_spill_lock(): plc_result = plc.strings.contains.like( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(pat, "str").device_value.c_value, - cudf.Scalar(esc, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat)), + plc.interop.from_arrow(pa.scalar(esc)), ) result = Column.from_pylibcudf(plc_result) @@ -1071,14 +1073,14 @@ def replace( plc.strings.regex_program.RegexProgram.create( pat, plc.strings.regex_flags.RegexFlags.DEFAULT ), - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl)), n, ) else: plc_result = plc.strings.replace.replace( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(pat).device_value.c_value, - cudf.Scalar(repl).device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat)), + plc.interop.from_arrow(pa.scalar(repl)), n, ) result = Column.from_pylibcudf(plc_result) @@ -1194,13 +1196,13 @@ def slice( 2 cm dtype: object """ - param_dtype = np.dtype(np.int32) + param_dtype = pa.int32() with acquire_spill_lock(): plc_result = plc.strings.slice.slice_strings( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(start, param_dtype).device_value.c_value, - cudf.Scalar(stop, param_dtype).device_value.c_value, - cudf.Scalar(step, param_dtype).device_value.c_value, + plc.interop.from_arrow(pa.scalar(start, param_dtype)), + plc.interop.from_arrow(pa.scalar(stop, param_dtype)), + plc.interop.from_arrow(pa.scalar(step, param_dtype)), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -2174,7 +2176,7 @@ def filter_alphanum( plc.strings.char_types.StringCharacterTypes.ALL_TYPES if keep else plc.strings.char_types.StringCharacterTypes.ALPHANUM, - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), plc.strings.char_types.StringCharacterTypes.ALPHANUM if keep else plc.strings.char_types.StringCharacterTypes.ALL_TYPES, @@ -2318,7 +2320,7 @@ def slice_replace( with acquire_spill_lock(): plc_result = plc.strings.replace.replace_slice( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), start, stop, ) @@ -2499,7 +2501,7 @@ def get_json_object( with acquire_spill_lock(): plc_result = plc.json.get_json_object( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(json_path, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(json_path)), options, ) result = Column.from_pylibcudf(plc_result) @@ -2657,7 +2659,12 @@ def split( if regex is True: data = self._column.split_re(pat, n) else: - data = self._column.split(cudf.Scalar(pat, "str"), n) + data = self._column.split( + plc.interop.from_arrow( + pa.scalar(pat, type=pa.string()) + ), + n, + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ -2667,7 +2674,7 @@ def split( result_table = self._column.split_record_re(pat, n) else: result_table = self._column.split_record( - cudf.Scalar(pat, "str"), n + plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2829,7 +2836,12 @@ def rsplit( if regex is True: data = self._column.rsplit_re(pat, n) else: - data = self._column.rsplit(cudf.Scalar(pat, "str"), n) + data = self._column.rsplit( + plc.interop.from_arrow( + pa.scalar(pat, type=pa.string()) + ), + n, + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ 
-2839,7 +2851,7 @@ def rsplit( result_table = self._column.rsplit_record_re(pat, n) else: result_table = self._column.rsplit_record( - cudf.Scalar(pat, "str"), n + plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2924,7 +2936,9 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - self._column.partition(cudf.Scalar(sep, "str")), + self._column.partition( + plc.interop.from_arrow(pa.scalar(sep, type=pa.string())) + ), expand=expand, ) @@ -2989,7 +3003,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - self._column.rpartition(cudf.Scalar(sep, "str")), + self._column.rpartition( + plc.interop.from_arrow(pa.scalar(sep, type=pa.string())) + ), expand=expand, ) @@ -3303,7 +3319,7 @@ def _strip( plc_result = plc.strings.strip.strip( self._column.to_pylibcudf(mode="read"), side, - cudf.Scalar(to_strip, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(to_strip, type=pa.string())), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -3920,7 +3936,7 @@ def _starts_ends_with( f"{type(pat).__name__}" ) elif is_scalar(pat): - plc_pat = cudf.Scalar(pat, "str").device_value.c_value + plc_pat = plc.interop.from_arrow(pa.scalar(pat, type=pa.string())) else: plc_pat = column.as_column(pat, dtype="str").to_pylibcudf( mode="read" @@ -4120,7 +4136,7 @@ def _find( with acquire_spill_lock(): plc_result = method( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(sub, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sub, type=pa.string())), start, end, ) @@ -4603,7 +4619,7 @@ def filter_characters( plc.strings.translate.FilterType.KEEP if keep else plc.strings.translate.FilterType.REMOVE, - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -4710,10 +4726,10 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: if isinstance(delim, Column): result = self._return_or_inplace( - self._column.tokenize_column(delim), + self._column.tokenize_column(delim), # type: ignore[arg-type] retain_index=False, ) - elif isinstance(delim, cudf.Scalar): + elif isinstance(delim, plc.Scalar): result = self._return_or_inplace( self._column.tokenize_scalar(delim), retain_index=False, @@ -4851,10 +4867,10 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) if isinstance(delim, Column): return self._return_or_inplace( - self._column.count_tokens_column(delim) + self._column.count_tokens_column(delim) # type: ignore[arg-type] ) - elif isinstance(delim, cudf.Scalar): + elif isinstance(delim, plc.Scalar): return self._return_or_inplace( self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) @@ -5112,7 +5128,7 @@ def replace_tokens( self._column.replace_tokens( targets_column, # type: ignore[arg-type] replacements_column, # type: ignore[arg-type] - cudf.Scalar(delimiter, dtype="str"), + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), ), ) @@ -5181,8 +5197,10 @@ def filter_tokens( return self._return_or_inplace( self._column.filter_tokens( min_token_length, - cudf.Scalar(replacement, dtype="str"), - cudf.Scalar(delimiter, dtype="str"), + plc.interop.from_arrow( + pa.scalar(replacement, type=pa.string()) + 
), + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), ), ) @@ -5501,12 +5519,12 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: def _massage_string_arg( value, name, allow_col: bool = False -) -> StringColumn | cudf.Scalar: +) -> StringColumn | plc.Scalar: if isinstance(value, cudf.Scalar): return value if isinstance(value, str): - return cudf.Scalar(value, dtype="str") + return plc.interop.from_arrow(pa.scalar(value, type=pa.string())) allowed_types = ["Scalar"] @@ -5747,8 +5765,8 @@ def sum( with acquire_spill_lock(): plc_column = plc.strings.combine.join_strings( result_col.to_pylibcudf(mode="read"), - cudf.Scalar("").device_value.c_value, - cudf.Scalar(None, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow(pa.scalar(None, type=pa.string())), ) return Column.from_pylibcudf(plc_column).element_indexing(0) else: @@ -5766,7 +5784,7 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: self.to_pylibcudf(mode="read") ) result = Column.from_pylibcudf(plc_column) - return (result > cudf.Scalar(0, dtype="int8")).fillna(False) + return (result > np.int8(0)).fillna(False) elif out_dtype.kind in {"i", "u"}: if not self.is_integer().all(): raise ValueError( @@ -6033,8 +6051,10 @@ def _binaryop( rhs.to_pylibcudf(mode="read"), ] ), - cudf.Scalar("").device_value.c_value, - cudf.Scalar(None, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow( + pa.scalar(None, type=pa.string()) + ), ) return Column.from_pylibcudf(plc_column) elif op in { @@ -6120,11 +6140,11 @@ def jaccard_index(self, other: Self, width: int) -> NumericalColumn: return type(self).from_pylibcudf(result) # type: ignore[return-value] @acquire_spill_lock() - def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + def generate_ngrams(self, ngrams: int, separator: plc.Scalar) -> Self: result = plc.nvtext.generate_ngrams.generate_ngrams( self.to_pylibcudf(mode="read"), ngrams, - separator.device_value.c_value, + separator, ) return type(self).from_pylibcudf(result) # type: ignore[return-value] @@ -6160,13 +6180,13 @@ def edit_distance_matrix(self) -> ListColumn: def byte_pair_encoding( self, merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, - separator: cudf.Scalar, + separator: str, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.byte_pair_encode.byte_pair_encoding( self.to_pylibcudf(mode="read"), merge_pairs, - separator.device_value.c_value, + plc.interop.from_arrow(pa.scalar(separator)), ) ) @@ -6174,15 +6194,15 @@ def byte_pair_encoding( def ngrams_tokenize( self, ngrams: int, - delimiter: cudf.Scalar, - separator: cudf.Scalar, + delimiter: plc.Scalar, + separator: plc.Scalar, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.ngrams_tokenize.ngrams_tokenize( self.to_pylibcudf(mode="read"), ngrams, - delimiter.device_value.c_value, - separator.device_value.c_value, + delimiter, + separator, ) ) @@ -6205,14 +6225,14 @@ def normalize_characters(self, do_lower: bool = True) -> Self: @acquire_spill_lock() def replace_tokens( - self, targets: Self, replacements: Self, delimiter: cudf.Scalar + self, targets: Self, replacements: Self, delimiter: plc.Scalar ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.replace.replace_tokens( self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read"), replacements.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + 
delimiter, ) ) @@ -6220,15 +6240,15 @@ def replace_tokens( def filter_tokens( self, min_token_length: int, - replacement: cudf.Scalar, - delimiter: cudf.Scalar, + replacement: plc.Scalar, + delimiter: plc.Scalar, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.replace.filter_tokens( self.to_pylibcudf(mode="read"), min_token_length, - replacement.device_value.c_value, - delimiter.device_value.c_value, + replacement, + delimiter, ) ) @@ -6279,10 +6299,10 @@ def subword_tokenize( return tokens, masks, metadata @acquire_spill_lock() - def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + def tokenize_scalar(self, delimiter: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.tokenize_scalar( - self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + self.to_pylibcudf(mode="read"), delimiter ) ) @@ -6296,10 +6316,10 @@ def tokenize_column(self, delimiters: Self) -> Self: ) @acquire_spill_lock() - def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + def count_tokens_scalar(self, delimiter: plc.Scalar) -> NumericalColumn: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.count_tokens_scalar( - self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + self.to_pylibcudf(mode="read"), delimiter ) ) @@ -6324,25 +6344,25 @@ def character_tokenize(self) -> Self: def tokenize_with_vocabulary( self, vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, - delimiter: cudf.Scalar, + delimiter: str, default_id: int, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.tokenize_with_vocabulary( self.to_pylibcudf(mode="read"), vocabulary, - delimiter.device_value.c_value, + plc.interop.from_arrow(pa.scalar(delimiter)), default_id, ) ) @acquire_spill_lock() - def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: + def detokenize(self, indices: ColumnBase, separator: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.detokenize( self.to_pylibcudf(mode="read"), indices.to_pylibcudf(mode="read"), - separator.device_value.c_value, + separator, ) ) @@ -6491,23 +6511,23 @@ def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: @acquire_spill_lock() def _split_record( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, maxsplit: int, method: Callable[[plc.Column, plc.Scalar, int], plc.Column], ) -> Self: plc_column = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, maxsplit, ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] - def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + def split_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self: return self._split_record( delimiter, maxsplit, plc.strings.split.split.split_record ) - def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + def rsplit_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self: return self._split_record( delimiter, maxsplit, plc.strings.split.split.rsplit_record ) @@ -6515,13 +6535,13 @@ def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: @acquire_spill_lock() def _split( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, maxsplit: int, method: Callable[[plc.Column, plc.Scalar, int], plc.Column], ) -> dict[int, Self]: plc_table = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, 
maxsplit, ) return dict( @@ -6531,21 +6551,21 @@ def _split( ) ) - def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + def split(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]: return self._split(delimiter, maxsplit, plc.strings.split.split.split) - def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + def rsplit(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]: return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit) @acquire_spill_lock() def _partition( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, method: Callable[[plc.Column, plc.Scalar], plc.Column], ) -> dict[int, Self]: plc_table = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, ) return dict( enumerate( @@ -6554,12 +6574,12 @@ def _partition( ) ) - def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + def partition(self, delimiter: plc.Scalar) -> dict[int, Self]: return self._partition( delimiter, plc.strings.split.partition.partition ) - def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + def rpartition(self, delimiter: plc.Scalar) -> dict[int, Self]: return self._partition( delimiter, plc.strings.split.partition.rpartition ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 40d36a6ff56..5cea35ac0d6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6229,10 +6229,8 @@ def isin(self, values): # TODO: propagate nulls through isin # https://github.com/rapidsai/cudf/issues/7556 - fill_value = cudf.Scalar(False) - def make_false_column_like_self(): - return column.as_column(fill_value, length=len(self), dtype="bool") + return column.as_column(False, length=len(self), dtype="bool") # Preprocess different input types into a mapping from column names to # a list of values to check. diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6ae524d6346..17302311a7e 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -14,6 +14,7 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa import pylibcudf as plc @@ -45,6 +46,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply +from cudf.utils.dtypes import cudf_dtype_to_pa_type from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -852,7 +854,9 @@ def _shift( plc.table.Table([col.to_pylibcudf(mode="read") for col in values]), [periods] * len(values), [ - cudf.Scalar(val, dtype=col.dtype).device_value.c_value + plc.interop.from_arrow( + pa.scalar(val, type=cudf_dtype_to_pa_type(col.dtype)) + ) for val, col in zip(fill_values, values) ], ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 54635b162bc..b535e8aabd2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2327,8 +2327,7 @@ def microsecond(self) -> Index: # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this # __mul__ returns an int16 column. 
- self._column.millisecond.astype("int32") - * cudf.Scalar(1000, dtype="int32") + self._column.millisecond.astype("int32") * np.int32(1000) ) + self._column.microsecond, name=self.name, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c779e1ebe97..eded681baf0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3255,7 +3255,7 @@ def duplicated( ) distinct = libcudf.column.Column.from_pylibcudf(plc_column) result = copying.scatter( - [cudf.Scalar(False, dtype=bool)], + [cudf.Scalar(False)], distinct, [as_column(True, length=len(self), dtype=bool)], bounds_check=False, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3b047ee5ed4..805f9f9a9f9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4106,8 +4106,8 @@ def microsecond(self) -> Series: # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this # __mul__ returns an int16 column. - extra = self.series._column.millisecond.astype("int32") * cudf.Scalar( - 1000, dtype="int32" + extra = self.series._column.millisecond.astype("int32") * np.int32( + 1000 ) return self._return_result_like_self(micro + extra) diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index fb8b9b3131c..58dabc85491 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -42,9 +42,8 @@ def tokenize( """ if delimiter is None: delimiter = "" - delim = cudf.Scalar(delimiter, dtype="str") result = text._column.tokenize_with_vocabulary( - self.vocabulary, delim, default_id + self.vocabulary, delimiter, default_id ) return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2f8a6d9e5e7..e2c332f34f5 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,10 +1,11 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION +# Copyright (c) 2020-2025, NVIDIA CORPORATION from __future__ import annotations import warnings from typing import TYPE_CHECKING import numba +import numpy as np import pandas as pd from pandas.api.indexers import BaseIndexer @@ -273,12 +274,8 @@ def _apply_agg_column(self, source_column, agg_name): end = as_column(end, dtype="int32") idx = as_column(range(len(start))) - preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( - "int32" - ) - following_window = (end - idx - cudf.Scalar(1, "int32")).astype( - "int32" - ) + preceding_window = (idx - start + np.int32(1)).astype("int32") + following_window = (end - idx - np.int32(1)).astype("int32") window = None else: preceding_window = as_column(self.window) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index da0aa5be6f5..b1f81edfc54 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
import functools import operator @@ -14,6 +14,7 @@ from cudf.core.column.column import column_empty from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES +from cudf.utils.dtypes import cudf_dtype_to_pa_type @pytest.mark.parametrize( @@ -423,7 +424,9 @@ def test_get_ind_sequence(): def test_contains_scalar(data, scalar, expect): sr = cudf.Series(data) expect = cudf.Series(expect) - got = sr.list.contains(cudf.Scalar(scalar, sr.dtype.element_type)) + got = sr.list.contains( + pa.scalar(scalar, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) assert_eq(expect, got) @@ -455,7 +458,9 @@ def test_contains_scalar(data, scalar, expect): def test_contains_null_search_key(data, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="bool") - got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type)) + got = sr.list.contains( + pa.scalar(None, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) assert_eq(expect, got) @@ -518,12 +523,12 @@ def test_contains_invalid(data, scalar): ), ( [["d", None, "e"], [None, "f"], []], - cudf.Scalar(cudf.NA, "O"), + pa.scalar(None, type=pa.string()), [None, None, None], ), ( [None, [10, 9, 8], [5, 8, None]], - cudf.Scalar(cudf.NA, "int64"), + pa.scalar(None, type=pa.int64()), [None, None, None], ), ], @@ -532,7 +537,11 @@ def test_index(data, search_key, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="int32") if is_scalar(search_key): - got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type)) + got = sr.list.index( + pa.scalar( + search_key, type=cudf_dtype_to_pa_type(sr.dtype.element_type) + ) + ) else: got = sr.list.index( cudf.Series(search_key, dtype=sr.dtype.element_type) From 76f1c8ba9f2fd7ab6a6f3fd017ce11dd27963827 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 8 Jan 2025 15:02:10 -0600 Subject: [PATCH 16/26] Use latest ci-conda images (#17690) Use `ci-conda:latest` tags for all jobs. All jobs should now support `ci-conda:latest`, and older pinnings are probably not necessary anymore. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/17690 --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 6 +++--- .github/workflows/test.yaml | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index fb7182f4133..65aebfb7f8c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9d79733703c..e955b8f1f80 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -186,7 +186,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" static-configure: needs: checks @@ -207,7 +207,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -217,7 +217,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 858352f515d..dc82c17022a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit @@ -94,7 +94,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -106,7 +106,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit From cb77046d8baad31f4856c097f7052b3a3858c363 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 8 Jan 2025 21:05:41 -0500 Subject: [PATCH 17/26] Bump Polars version to <1.18 (#17632) This PR upgrades the Polars version to 1.17. It xfails some polars tests due to known issues and adds the `maintain_order` param to joins (not implemented yet). 
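As a concrete illustration of the `maintain_order` carve-out, here is a minimal sketch (the frame contents are invented for illustration; the real coverage is the `test_join_maintain_order_param_unsupported` test added further below). Recent polars releases accept the parameter, while this engine rejects it at IR translation time:

    import polars as pl

    left = pl.LazyFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
    right = pl.LazyFrame({"a": [2, 3, 4], "c": [0.5, 1.5, 2.5]})

    # Accepted by polars itself, but cudf-polars raises NotImplementedError
    # during IR translation for any value other than "none". Collecting with
    # pl.GPUEngine(raise_on_fail=True) therefore errors; with the default
    # settings, execution falls back to the CPU engine instead.
    q = left.join(right, on="a", how="inner", maintain_order="left_right")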
Notable change Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17632 --- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/meta.yaml | 4 +- dependencies.yaml | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 42 ++++++++++++++++--- .../cudf_polars/cudf_polars/dsl/translate.py | 4 +- .../cudf_polars/cudf_polars/testing/plugin.py | 21 ++++++++++ python/cudf_polars/pyproject.toml | 4 +- python/cudf_polars/tests/test_join.py | 11 ++++- 9 files changed, 76 insertions(+), 16 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a4b3f4fe174..6ff9a5f832b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.15 +- polars>=1.11,<1.18 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<19.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 7173c955116..e82192b8cdb 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.15 +- polars>=1.11,<1.18 - pre-commit - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index b6c03dc1bc2..7a0005497df 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.15 + - polars >=1.11,<1.18 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index b0f217a6770..50b4cd3c372 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -747,7 +747,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.15 + - polars>=1.11,<1.18 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1c1d4860eec..fd56329a48e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """ DSL nodes for the LogicalPlan of polars. 
@@ -34,9 +34,11 @@ from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: - from collections.abc import Callable, Hashable, MutableMapping, Sequence + from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence from typing import Literal + from polars.polars import _expr_nodes as pl_expr + from cudf_polars.typing import Schema @@ -1019,7 +1021,27 @@ class ConditionalJoin(IR): __slots__ = ("ast_predicate", "options", "predicate") _non_child = ("schema", "predicate", "options") predicate: expr.Expr - options: tuple + """Expression predicate to join on""" + options: tuple[ + tuple[ + str, + pl_expr.Operator | Iterable[pl_expr.Operator], + ], + bool, + tuple[int, int] | None, + str, + bool, + Literal["none", "left", "right", "left_right", "right_left"], + ] + """ + tuple of options: + - predicates: tuple of ir join type (eg. ie_join) and (In)Equality conditions + - join_nulls: do nulls compare equal? + - slice: optional slice to perform after joining. + - suffix: string suffix for right columns if names match + - coalesce: should key columns be coalesced (only makes sense for outer joins) + - maintain_order: which DataFrame row order to preserve, if any + """ def __init__( self, schema: Schema, predicate: expr.Expr, options: tuple, left: IR, right: IR @@ -1029,15 +1051,16 @@ def __init__( self.options = options self.children = (left, right) self.ast_predicate = to_ast(predicate) - _, join_nulls, zlice, suffix, coalesce = self.options + _, join_nulls, zlice, suffix, coalesce, maintain_order = self.options # Preconditions from polars assert not join_nulls assert not coalesce + assert maintain_order == "none" if self.ast_predicate is None: raise NotImplementedError( f"Conditional join with predicate {predicate}" ) # pragma: no cover; polars never delivers expressions we can't handle - self._non_child_args = (self.ast_predicate, zlice, suffix) + self._non_child_args = (self.ast_predicate, zlice, suffix, maintain_order) @classmethod def do_evaluate( @@ -1045,6 +1068,7 @@ def do_evaluate( predicate: plc.expressions.Expression, zlice: tuple[int, int] | None, suffix: str, + maintain_order: Literal["none", "left", "right", "left_right", "right_left"], left: DataFrame, right: DataFrame, ) -> DataFrame: @@ -1088,6 +1112,7 @@ class Join(IR): tuple[int, int] | None, str, bool, + Literal["none", "left", "right", "left_right", "right_left"], ] """ tuple of options: @@ -1096,6 +1121,7 @@ class Join(IR): - slice: optional slice to perform after joining. 
- suffix: string suffix for right columns if names match - coalesce: should key columns be coalesced (only makes sense for outer joins) + - maintain_order: which DataFrame row order to preserve, if any """ def __init__( @@ -1113,6 +1139,9 @@ def __init__( self.options = options self.children = (left, right) self._non_child_args = (self.left_on, self.right_on, self.options) + # TODO: Implement maintain_order + if options[5] != "none": + raise NotImplementedError("maintain_order not implemented yet") if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -1222,12 +1251,13 @@ def do_evaluate( tuple[int, int] | None, str, bool, + Literal["none", "left", "right", "left_right", "right_left"], ], left: DataFrame, right: DataFrame, ) -> DataFrame: """Evaluate and return a dataframe.""" - how, join_nulls, zlice, suffix, coalesce = options + how, join_nulls, zlice, suffix, coalesce, _ = options if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 37cf36dc4dd..2138ac0c700 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Translate polars IR representation to ours.""" @@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - if (version := self.visitor.version()) >= (4, 0): + if (version := self.visitor.version()) >= (4, 3): e = NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. 
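To make the six-element options layout documented above concrete, a small standalone sketch (the helper name `unpack_join_options` is invented; the real unpacking lives in `Join.__init__` and `Join.do_evaluate` in this diff):

    from typing import Literal

    MaintainOrder = Literal["none", "left", "right", "left_right", "right_left"]

    def unpack_join_options(
        options: tuple[str, bool, tuple[int, int] | None, str, bool, MaintainOrder],
    ) -> None:
        # Mirrors (how, join_nulls, zlice, suffix, coalesce, maintain_order).
        how, join_nulls, zlice, suffix, coalesce, maintain_order = options
        if maintain_order != "none":
            # The new sixth element is validated eagerly but not yet honored.
            raise NotImplementedError("maintain_order not implemented yet")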
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 87628242838..c16df320ceb 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -123,6 +123,11 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[read_parquet-]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-0]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-2]": "Need to add include_file_path to IR", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", @@ -140,6 +145,22 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + 
"tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 5904942aea2..9fb9bbf391e 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.15", + "polars>=1.11,<1.18", "pylibcudf==25.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 2fcbbf21f1c..f1f47bfb9f1 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -53,6 +53,15 @@ def right(): ) +@pytest.mark.parametrize( + "maintain_order", ["left", "left_right", "right_left", "right"] +) +def test_join_maintain_order_param_unsupported(left, right, maintain_order): + q = left.join(right, on=pl.col("a"), how="inner", maintain_order=maintain_order) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "join_expr", [ From bf8043337a131b40e54cb0bea157cf406b59a603 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Fri, 10 Jan 2025 02:08:02 +0800 Subject: [PATCH 18/26] Use Numba Config to turn on Pynvjitlink Features (#17628) Numba-cuda 0.0.18+ merged in a new feature and made the old way of patching the linker with pynvjitlink's `patch_numba_linker` no longer usable by downstream libraries. The current state of Numba-cuda requires downstream libraries to enable pynvjitlink features only via the `CUDA_ENABLE_PYNVJITLINK` environment variable. A recent PR https://github.com/NVIDIA/numba-cuda/pull/91 makes it possible to turn the features on through a config variable at runtime. This PR serves as an integration test for that change and updates how pynvjitlink is enabled in cuDF. It enables cuDF to use Numba-cuda 0.2.0+ (which contains the config change). Supersedes https://github.com/rapidsai/cudf/pull/17359/ Authors: - Michael Wang (https://github.com/isVoid) - Peter Andreas Entschev (https://github.com/pentschev) - Bradley Dice (https://github.com/bdice) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17628 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 4 ++-- dependencies.yaml | 6 +++--- python/cudf/cudf/utils/_numba.py | 6 ++---- python/cudf/pyproject.toml | 4 ++-- python/dask_cudf/pyproject.toml | 2 +- 7 files changed, 12 insertions(+), 14 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 6ff9a5f832b..a8e5018b283 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -55,7 +55,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13,<0.0.18 +- numba-cuda>=0.2.0,<0.3.0 - numpy>=1.23,<3.0a0 - numpydoc - nvcc_linux-64=11.8 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index e82192b8cdb..6dc99b14f5d 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -54,7 +54,7 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.0.13,<0.0.18 +- numba-cuda>=0.2.0,<0.3.0 - numpy>=1.23,<3.0a0 - numpydoc - nvcomp==4.1.0.6 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 2c16deeed82..b34496cc256 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' 
+ version.split('.')[1] %} @@ -80,7 +80,7 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.0.13,<0.0.18 + - numba-cuda >=0.2.0,<0.3.0 - numpy >=1.23,<3.0a0 - pyarrow>=14.0.0,<18.0.0a0 - libcudf ={{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index 50b4cd3c372..4672a355c72 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -688,7 +688,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.0.13,<0.0.18 + - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0 - nvtx>=0.2.1 - packaging - rich @@ -810,11 +810,11 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - *numba-cuda-dep + - numba-cuda==0.2.0 - pandas==2.0.* - matrix: {dependencies: "latest"} packages: - - numba-cuda==0.0.15 + - *numba-cuda-dep - pandas==2.2.3 - matrix: packages: diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py index d9dde58d998..574170d28c6 100644 --- a/python/cudf/cudf/utils/_numba.py +++ b/python/cudf/cudf/utils/_numba.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. import glob import os @@ -130,9 +130,7 @@ def _setup_numba(): if driver_version < (12, 0): patch_numba_linker_cuda_11() else: - from pynvjitlink.patch import patch_numba_linker - - patch_numba_linker() + numba_config.CUDA_ENABLE_PYNVJITLINK = True class _CUDFNumbaConfig: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 2fdf6b34b8f..c6a5887f85d 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -24,7 +24,7 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.2.*,>=0.0.0a0", - "numba-cuda>=0.0.13,<0.0.18", + "numba-cuda>=0.2.0,<0.3.0", "numpy>=1.23,<3.0a0", "nvtx>=0.2.1", "packaging", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index b88816a3d47..5b8b98c2b55 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -47,7 +47,7 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==25.2.*,>=0.0.0a0", - "numba-cuda>=0.0.13,<0.0.18", + "numba-cuda>=0.2.0,<0.3.0", "pytest-cov", "pytest-xdist", "pytest<8", From f13d8fc851647edde490989b9f0d7d58728fc9a5 Mon Sep 17 00:00:00 2001 From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com> Date: Thu, 9 Jan 2025 13:59:31 -0500 Subject: [PATCH 19/26] Fix parquet reader list bug (#17699) This fixes a bug in the parquet list reader, where if a row had a list so long that it spanned a whole page, we would skip reading the page entirely. Authors: - Paul Mattione (https://github.com/pmattione-nvidia) Approvers: - https://github.com/nvdbaranec - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17699 --- cpp/src/io/parquet/decode_fixed.cu | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index 9acbe026bb2..32bb3349666 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -961,9 +961,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) return; } - // if we have no work to do (eg, in a skip_rows/num_rows case) in this page. - if (s->num_rows == 0) { return; } - using value_decoder_type = std::conditional_t< split_decode_t, decode_fixed_width_split_values_func, From 231015910bae375077e07c01d2bf70697182ccad Mon Sep 17 00:00:00 2001 From: Shruti Shivakumar Date: Thu, 9 Jan 2025 16:53:25 -0500 Subject: [PATCH 20/26] Support multithreaded reading of compressed buffers in JSON reader (#17670) Addresses #17638 This PR introduces multithreaded host-side decompression of compressed input buffers passed to the JSON reader, and uses a stream pool to transfer the uncompressed buffers to device. Authors: - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Paul Mattione (https://github.com/pmattione-nvidia) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17670 --- cpp/src/io/json/read_json.cu | 71 +++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 82d8152ca1c..113342e9cbf 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -37,12 +38,25 @@ #include #include +#include +#include + #include namespace cudf::io::json::detail { namespace { +namespace pools { + +BS::thread_pool& tpool() +{ + static BS::thread_pool _tpool(std::thread::hardware_concurrency()); + return _tpool; +} + +} // namespace pools + class compressed_host_buffer_source final : public datasource { public: explicit compressed_host_buffer_source(std::unique_ptr const& src, @@ -51,8 +65,8 @@ class compressed_host_buffer_source final : public datasource { { auto ch_buffer = host_span(reinterpret_cast(_dbuf_ptr->data()), _dbuf_ptr->size()); - if (comptype == compression_type::GZIP || comptype == compression_type::ZIP || - comptype == compression_type::SNAPPY) { + if (_comptype == compression_type::GZIP || _comptype == compression_type::ZIP || + _comptype == compression_type::SNAPPY) { _decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer); } else { _decompressed_buffer = cudf::io::detail::decompress(_comptype, ch_buffer); @@ -96,7 +110,22 @@ class compressed_host_buffer_source final : public datasource { return std::make_unique(_decompressed_buffer.data() + offset, count); } - [[nodiscard]] bool supports_device_read() const override { return false; } + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override + { + auto& thread_pool = pools::tpool(); + return thread_pool.submit_task([this, offset, size, dst, stream] { + auto hbuf = host_read(offset, size); + CUDF_CUDA_TRY( + cudaMemcpyAsync(dst, hbuf->data(), hbuf->size(), cudaMemcpyHostToDevice, stream.value())); + stream.synchronize(); + return hbuf->size(); + }); + } + + [[nodiscard]] bool supports_device_read() const override { return true; } [[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; 
} @@ -431,6 +460,8 @@ device_span ingest_raw_input(device_span buffer, // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line // delimiter. auto constexpr num_delimiter_chars = 1; + std::vector> thread_tasks; + auto stream_pool = cudf::detail::fork_streams(stream, pools::tpool().get_thread_count()); auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); @@ -447,13 +478,17 @@ device_span ingest_raw_input(device_span buffer, auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset); range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0; - for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) { + for (std::size_t i = start_source, cur_stream = 0; + i < sources.size() && bytes_read < total_bytes_to_read; + i++) { if (sources[i]->is_empty()) continue; auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read); auto destination = reinterpret_cast(buffer.data()) + bytes_read + (num_delimiter_chars * delimiter_map.size()); - if (sources[i]->is_device_read_preferred(data_size)) { - bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream); + if (sources[i]->supports_device_read()) { + thread_tasks.emplace_back(sources[i]->device_read_async( + range_offset, data_size, destination, stream_pool[cur_stream++ % stream_pool.size()])); + bytes_read += data_size; } else { h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size)); auto const& h_buffer = h_buffers.back(); @@ -481,6 +516,15 @@ device_span ingest_raw_input(device_span buffer, buffer.data()); } stream.synchronize(); + + if (thread_tasks.size()) { + auto const bytes_read = std::accumulate( + thread_tasks.begin(), thread_tasks.end(), std::size_t{0}, [](std::size_t sum, auto& task) { + return sum + task.get(); + }); + CUDF_EXPECTS(bytes_read == total_bytes_to_read, "something's fishy"); + } + return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars)); } @@ -505,10 +549,17 @@ table_with_metadata read_json(host_span> sources, return read_json_impl(sources, reader_opts, stream, mr); std::vector> compressed_sources; - for (size_t i = 0; i < sources.size(); i++) { - compressed_sources.emplace_back( - std::make_unique(sources[i], reader_opts.get_compression())); + std::vector>> thread_tasks; + auto& thread_pool = pools::tpool(); + for (auto& src : sources) { + thread_tasks.emplace_back(thread_pool.submit_task([&reader_opts, &src] { + return std::make_unique(src, reader_opts.get_compression()); + })); } + std::transform(thread_tasks.begin(), + thread_tasks.end(), + std::back_inserter(compressed_sources), + [](auto& task) { return task.get(); }); // in read_json_impl, we need the compressed source size to actually be the // uncompressed source size for correct batching return read_json_impl(compressed_sources, reader_opts, stream, mr); From a8a41975b0c1cfaedb7d4461ee027f6f9ff75b0e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Jan 2025 14:16:04 -0800 Subject: [PATCH 21/26] Remove cudf._libs.types.pyx (#17665) Contributes to https://github.com/rapidsai/cudf/issues/17317 1. Moves some Python routines/objects to `cudf/utils/dtypes.py` 2. 
Moves specific column only routines directly to `cudf/_libs/column.pyx` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17665 --- python/cudf/cudf/_lib/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/column.pxd | 4 +- python/cudf/cudf/_lib/column.pyx | 100 ++++++++-- python/cudf/cudf/_lib/scalar.pyx | 49 ++--- python/cudf/cudf/_lib/types.pxd | 11 -- python/cudf/cudf/_lib/types.pyx | 172 ------------------ python/cudf/cudf/core/_base_index.py | 9 +- .../cudf/cudf/core/_internals/aggregation.py | 4 +- python/cudf/cudf/core/_internals/binaryop.py | 4 +- python/cudf/cudf/core/_internals/unary.py | 4 +- python/cudf/cudf/core/column/categorical.py | 10 +- python/cudf/cudf/core/column/column.py | 27 +-- python/cudf/cudf/core/column/lists.py | 4 +- python/cudf/cudf/core/column/string.py | 12 +- python/cudf/cudf/core/copy_types.py | 6 +- python/cudf/cudf/core/dtypes.py | 5 +- python/cudf/cudf/core/groupby/groupby.py | 21 +-- python/cudf/cudf/core/index.py | 6 +- python/cudf/cudf/core/indexed_frame.py | 3 +- python/cudf/cudf/core/join/join.py | 6 +- python/cudf/cudf/core/multiindex.py | 11 +- python/cudf/cudf/core/reshape.py | 9 +- python/cudf/cudf/io/csv.py | 8 +- python/cudf/cudf/io/json.py | 8 +- python/cudf/cudf/io/orc.py | 4 +- python/cudf/cudf/utils/dtypes.py | 66 ++++++- 26 files changed, 251 insertions(+), 316 deletions(-) delete mode 100644 python/cudf/cudf/_lib/types.pxd delete mode 100644 python/cudf/cudf/_lib/types.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index ff6fba1c3e8..ec44a6aa8c5 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx) +set(cython_sources column.pyx scalar.pyx strings_udf.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8b1d16f0d85..026c12895e8 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from typing import Literal @@ -13,6 +13,8 @@ from pylibcudf.libcudf.column.column_view cimport ( from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer +cdef dtype_from_lists_column_view(column_view cv) +cdef dtype_from_column_view(column_view cv) cdef class Column: cdef public: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f7dcd89ea48..c59bbc0f40c 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from typing import Literal @@ -19,24 +19,21 @@ from cudf.core.buffer import ( as_buffer, cuda_array_interface_wrapper, ) -from cudf.utils.dtypes import _get_base_dtype +from cudf.utils.dtypes import ( + _get_base_dtype, + dtype_to_pylibcudf_type, + PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES, +) from cpython.buffer cimport PyObject_CheckBuffer -from libc.stdint cimport uintptr_t -from libcpp.memory cimport make_unique, unique_ptr +from libc.stdint cimport uintptr_t, int32_t +from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector from rmm.pylibrmm.device_buffer cimport DeviceBuffer -from cudf._lib.types cimport ( - dtype_from_column_view, - dtype_to_pylibcudf_type, -) - -from cudf._lib.types import dtype_from_pylibcudf_column - -from pylibcudf cimport DataType as plc_DataType +from pylibcudf cimport DataType as plc_DataType, Column as plc_Column cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary @@ -45,6 +42,7 @@ from pylibcudf.libcudf.column.column_factories cimport ( make_numeric_column ) from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count from pylibcudf.libcudf.scalar.scalar cimport scalar @@ -64,6 +62,80 @@ cdef get_element(column_view col_view, size_type index): ) +def dtype_from_pylibcudf_column(plc_Column col not None): + type_ = col.type() + tid = type_.id() + + if tid == pylibcudf.TypeId.LIST: + child = col.list_view().child() + return cudf.ListDtype(dtype_from_pylibcudf_column(child)) + elif tid == pylibcudf.TypeId.STRUCT: + fields = { + str(i): dtype_from_pylibcudf_column(col.child(i)) + for i in range(col.num_children()) + } + return cudf.StructDtype(fields) + elif tid == pylibcudf.TypeId.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL32: + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] + + +cdef dtype_from_lists_column_view(column_view cv): + # lists_column_view have no default constructor, so we heap + # allocate it to get around Cython's limitation of requiring + # default constructors for stack allocated objects + cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) + cdef column_view child = lv.get()[0].child() + + if child.type().id() == libcudf_types.type_id.LIST: + return cudf.ListDtype(dtype_from_lists_column_view(child)) + else: + return cudf.ListDtype(dtype_from_column_view(child)) + + +cdef dtype_from_column_view(column_view cv): + cdef libcudf_types.type_id tid = cv.type().id() + if tid == libcudf_types.type_id.LIST: + return dtype_from_lists_column_view(cv) + elif tid == libcudf_types.type_id.STRUCT: + fields = { + str(i): dtype_from_column_view(cv.child(i)) + for i in range(cv.num_children()) + } + return cudf.StructDtype(fields) + elif tid == libcudf_types.type_id.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL32: + return cudf.Decimal32Dtype( + 
precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[(tid)] + + cdef class Column: """ A Column stores columnar data in device memory. @@ -361,7 +433,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[mutable_column_view] children cdef void* data @@ -424,7 +496,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index fd6d0257940..65607c91302 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import copy @@ -14,17 +14,16 @@ import pylibcudf as plc import cudf from cudf.core.dtypes import ListDtype, StructDtype -from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf.core.missing import NA, NaT +from cudf.utils.dtypes import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES # We currently need this cimport because some of the implementations here # access the c_obj of the scalar, and because we need to be able to call # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
-from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID -from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar +from pylibcudf cimport Scalar as plc_Scalar +from pylibcudf.libcudf.scalar.scalar cimport scalar def _replace_nested(obj, check, replacement): @@ -223,40 +222,22 @@ cdef class DeviceScalar: return s cdef void _set_dtype(self, dtype=None): - cdef plc_TypeID cdtype_id = self.c_value.type().id() + cdtype_id = self.c_value.type().id() if dtype is not None: self._dtype = dtype elif cdtype_id in { - plc_TypeID.DECIMAL32, - plc_TypeID.DECIMAL64, - plc_TypeID.DECIMAL128, + plc.TypeID.DECIMAL32, + plc.TypeID.DECIMAL64, + plc.TypeID.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - elif cdtype_id == plc_TypeID.STRUCT: - struct_table_view = (self.get_raw_ptr())[0].view() - self._dtype = StructDtype({ - str(i): dtype_from_column_view(struct_table_view.column(i)) - for i in range(struct_table_view.num_columns()) - }) - elif cdtype_id == plc_TypeID.LIST: - if ( - self.get_raw_ptr() - )[0].view().type().id() == plc_TypeID.LIST: - self._dtype = dtype_from_column_view( - (self.get_raw_ptr())[0].view() - ) - else: - self._dtype = ListDtype( - PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - ( - (self.get_raw_ptr())[0] - .view().type().id() - ) - ] - ) + elif cdtype_id == plc.TypeID.STRUCT: + self._dtype = StructDtype.from_arrow( + plc.interop.to_arrow(self.c_value).type + ) + elif cdtype_id == plc.TypeID.LIST: + self._dtype = ListDtype.from_arrow(plc.interop.to_arrow(self.c_value).type) else: - self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (cdtype_id) - ] + self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[cdtype_id] diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd deleted file mode 100644 index 18b1d26e4db..00000000000 --- a/python/cudf/cudf/_lib/types.pxd +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -from pylibcudf.libcudf.column.column_view cimport column_view - -ctypedef int32_t underlying_type_t_type_id - -cdef dtype_from_column_view(column_view cv) - -cpdef dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx deleted file mode 100644 index 777bd070b32..00000000000 --- a/python/cudf/cudf/_lib/types.pyx +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd - -from libcpp.memory cimport make_shared, shared_ptr - -cimport pylibcudf.libcudf.types as libcudf_types -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view - -import pylibcudf as plc - -import cudf - - -SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { - np.dtype("int8"): plc.types.TypeId.INT8, - np.dtype("int16"): plc.types.TypeId.INT16, - np.dtype("int32"): plc.types.TypeId.INT32, - np.dtype("int64"): plc.types.TypeId.INT64, - np.dtype("uint8"): plc.types.TypeId.UINT8, - np.dtype("uint16"): plc.types.TypeId.UINT16, - np.dtype("uint32"): plc.types.TypeId.UINT32, - np.dtype("uint64"): plc.types.TypeId.UINT64, - np.dtype("float32"): plc.types.TypeId.FLOAT32, - np.dtype("float64"): plc.types.TypeId.FLOAT64, - np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): plc.types.TypeId.STRING, - np.dtype("bool"): plc.types.TypeId.BOOL8, - np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, -} -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - plc_type: np_type - for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() -} -# There's no equivalent to EMPTY in cudf. We translate EMPTY -# columns from libcudf to ``int8`` columns of all nulls in Python. -# ``int8`` is chosen because it uses the least amount of memory. 
-PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") - - -size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] - - -cdef dtype_from_lists_column_view(column_view cv): - # lists_column_view have no default constructor, so we heap - # allocate it to get around Cython's limitation of requiring - # default constructors for stack allocated objects - cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) - cdef column_view child = lv.get()[0].child() - - if child.type().id() == libcudf_types.type_id.LIST: - return cudf.ListDtype(dtype_from_lists_column_view(child)) - elif child.type().id() == libcudf_types.type_id.EMPTY: - return cudf.ListDtype("int8") - else: - return cudf.ListDtype( - dtype_from_column_view(child) - ) - -cdef dtype_from_structs_column_view(column_view cv): - fields = { - str(i): dtype_from_column_view(cv.child(i)) - for i in range(cv.num_children()) - } - return cudf.StructDtype(fields) - -cdef dtype_from_column_view(column_view cv): - cdef libcudf_types.type_id tid = cv.type().id() - if tid == libcudf_types.type_id.LIST: - return dtype_from_lists_column_view(cv) - elif tid == libcudf_types.type_id.STRUCT: - return dtype_from_structs_column_view(cv) - elif tid == libcudf_types.type_id.DECIMAL64: - return cudf.Decimal64Dtype( - precision=cudf.Decimal64Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL32: - return cudf.Decimal32Dtype( - precision=cudf.Decimal32Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL128: - return cudf.Decimal128Dtype( - precision=cudf.Decimal128Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - else: - return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (tid) - ] - - -cpdef dtype_to_pylibcudf_type(dtype): - if isinstance(dtype, cudf.ListDtype): - return plc.DataType(plc.TypeId.LIST) - elif isinstance(dtype, cudf.StructDtype): - return plc.DataType(plc.TypeId.STRUCT) - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = plc.TypeId.DECIMAL128 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = plc.TypeId.DECIMAL64 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = plc.TypeId.DECIMAL32 - return plc.DataType(tid, -dtype.scale) - # libcudf types don't support timezones so convert to the base type - elif isinstance(dtype, pd.DatetimeTZDtype): - dtype = np.dtype(f" ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes return self.codes - gather_map = self.codes.astype(libcudf.types.size_type_dtype).fillna(0) + gather_map = self.codes.astype(SIZE_TYPE_DTYPE).fillna(0) out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1192,10 +1192,10 @@ def _concat( codes = [o.codes for o in objs] newsize = sum(map(len, codes)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e23ca810065..30da8727366 100644 --- 
a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,7 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -60,9 +59,11 @@ from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, + dtype_to_pylibcudf_type, find_common_type, get_time_unit, is_column_like, @@ -874,7 +875,7 @@ def indices_of( value = as_column(value, dtype=self.dtype, length=1) mask = value.contains(self) return apply_boolean_mask( # type: ignore[return-value] - [as_column(range(0, len(self)), dtype=size_type_dtype)], mask + [as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask )[0] def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: @@ -954,7 +955,7 @@ def take( # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. if indices.dtype.kind not in {"u", "i"}: - indices = indices.astype(libcudf.types.size_type_dtype) + indices = indices.astype(SIZE_TYPE_DTYPE) GatherMap(indices, len(self), nullify=not check_bounds or nullify) gathered = copying.gather([self], indices, nullify=nullify) # type: ignore[arg-type] return gathered[0]._with_type_metadata(self.dtype) # type: ignore[return-value] @@ -1743,9 +1744,7 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), + as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1754,21 +1753,16 @@ def column_empty( cudf.core.column.NumericalColumn( data=as_buffer( rmm.DeviceBuffer( - size=row_count - * cudf.dtype(libcudf.types.size_type_dtype).itemsize + size=row_count * cudf.dtype(SIZE_TYPE_DTYPE).itemsize ) ), size=None, - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ) elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = as_buffer(rmm.DeviceBuffer(size=0)) - children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), - ) + children = (as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE),) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -2552,10 +2546,9 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: ) newsize = sum(map(len, objs)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"Result of concat cannot have " f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: return column_empty(0, head.dtype) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6fc2b5d4ca2..04b4003c510 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import 
ColumnBase, as_column @@ -22,6 +21,7 @@ from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype from cudf.core.missing import NA +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from collections.abc import Sequence @@ -258,7 +258,7 @@ def from_sequences( offset_col = cast( NumericalColumn, - column.as_column(offset_vals, dtype=size_type_dtype), + column.as_column(offset_vals, dtype=SIZE_TYPE_DTYPE), ) # Build ListColumn diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 20eded9a27f..2bee85cb387 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,16 +19,18 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + can_convert_to_column, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -5611,7 +5613,7 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: offsets = column.as_column( - 0, length=size + 1, dtype=size_type_dtype + 0, length=size + 1, dtype=SIZE_TYPE_DTYPE ) children = (offsets,) @@ -5888,7 +5890,7 @@ def as_decimal_column( ) -> cudf.core.column.DecimalBaseColumn: plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( self.to_pylibcudf(mode="read"), - libcudf.types.dtype_to_pylibcudf_type(dtype), + dtype_to_pylibcudf_type(dtype), ) result = Column.from_pylibcudf(plc_column) result.dtype.precision = dtype.precision # type: ignore[union-attr] diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 4b6ad59c8e1..aaaf6c7ee4f 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,11 +1,11 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast from typing_extensions import Self import cudf -from cudf._lib.types import size_type_dtype +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from cudf.core.column import NumericalColumn @@ -63,7 +63,7 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): # Alternately we can have an Optional[Column] and handle None # specially in _gather. self.column = cast( - "NumericalColumn", self.column.astype(size_type_dtype) + "NumericalColumn", self.column.astype(SIZE_TYPE_DTYPE) ) else: if self.column.dtype.kind not in {"i", "u"}: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 8ed233ba737..ce7fb968069 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations import decimal @@ -57,7 +57,8 @@ def dtype(arbitrary): if np_dtype.kind in set("OU"): return np.dtype("object") elif ( - np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES + np_dtype + not in cudf.utils.dtypes.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES ): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 17302311a7e..7bc4b08fc49 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -21,7 +21,6 @@ import cudf import cudf.core._internals from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( is_list_like, @@ -46,7 +45,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.dtypes import cudf_dtype_to_pa_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, cudf_dtype_to_pa_type from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -588,7 +587,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: offsets, group_keys, (indices,) = self._groups( [ cudf.core.column.as_column( - range(len(self.obj)), dtype=size_type_dtype + range(len(self.obj)), dtype=SIZE_TYPE_DTYPE ) ] ) @@ -1185,7 +1184,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # aggregation scheme in libcudf. This is probably "fast # enough" for most reasonable input sizes. _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) @@ -1199,7 +1198,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): group_offsets = group_offsets[:-1] else: group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype) + to_take = np.arange(size_per_group.sum(), dtype=SIZE_TYPE_DTYPE) fixup = np.empty_like(size_per_group) fixup[0] = 0 np.cumsum(size_per_group[:-1], out=fixup[1:]) @@ -1500,11 +1499,11 @@ def sample( # into a numpy array directly, rather than a list. # TODO: this uses the sort-based groupby, could one use hash-based? _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( - size_type_dtype.type(n), size_per_group.shape + SIZE_TYPE_DTYPE.type(n), size_per_group.shape ) if not replace and (minsize := size_per_group.min()) < n: raise ValueError( @@ -1517,7 +1516,7 @@ def sample( # which is round-to-nearest, ties to sgn(x) * inf). 
samples_per_group = np.round( size_per_group * frac, decimals=0 - ).astype(size_type_dtype) + ).astype(SIZE_TYPE_DTYPE) if replace: # We would prefer to use cupy here, but their rng.integers # interface doesn't take array-based low and high @@ -1525,7 +1524,7 @@ def sample( low = 0 high = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) - indices = rng.integers(low, high, dtype=size_type_dtype) + indices = rng.integers(low, high, dtype=SIZE_TYPE_DTYPE) indices += np.repeat(group_offsets[:-1], samples_per_group) else: # Approach: do a segmented argsort of the index array and take @@ -1533,7 +1532,7 @@ def sample( # We will shuffle the group indices and then pick them out # from the grouped dataframe index. nrows = len(group_values) - indices = cp.arange(nrows, dtype=size_type_dtype) + indices = cp.arange(nrows, dtype=SIZE_TYPE_DTYPE) if len(size_per_group) < 500: # Empirically shuffling with cupy is faster at this scale rs = cp.random.get_random_state() @@ -1557,7 +1556,7 @@ def sample( indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? - want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) + want = np.arange(samples_per_group.sum(), dtype=SIZE_TYPE_DTYPE) scan = np.empty_like(samples_per_group) scan[0] = 0 np.cumsum(samples_per_group[:-1], out=scan[1:]) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b535e8aabd2..0d1bf552982 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -53,6 +52,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, @@ -1002,7 +1002,7 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [self._range.index(value)] except ValueError: i = [] - return as_column(i, dtype=size_type_dtype) + return as_column(i, dtype=SIZE_TYPE_DTYPE) def isin(self, values, level=None): if level is not None and level > 0: @@ -1348,7 +1348,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = as_column( -1, length=len(needle), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index eded681baf0..4c6f8a9c152 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -60,6 +60,7 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring +from cudf.utils.dtypes import SIZE_TYPE_DTYPE from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf @@ -3034,7 +3035,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: NumericalColumn, as_column( range(start, stop, stride), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ), len(self), diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6e965ceca66..ce7edc8fdbe 100644 --- a/python/cudf/cudf/core/join/join.py +++ 
b/python/cudf/cudf/core/join/join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import Any @@ -7,7 +7,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap @@ -17,6 +16,7 @@ _IndexIndexer, _match_join_keys, ) +from cudf.utils.dtypes import SIZE_TYPE_DTYPE class Merge: @@ -243,7 +243,7 @@ def _gather_maps(self, left_cols, right_cols): # tables, we gather from iota on both right and left, and then # sort the gather maps with those two columns as key. key_order = [ - cudf.core.column.as_column(range(n), dtype=size_type_dtype).take( + cudf.core.column.as_column(range(n), dtype=SIZE_TYPE_DTYPE).take( map_, nullify=null, check_bounds=False ) for map_, n, null in zip(maps, lengths, nullify) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e7efd01ca85..64ec099cb39 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -17,7 +17,6 @@ import cudf import cudf._lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column @@ -34,7 +33,7 @@ ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import is_column_like +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_column_like from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -199,7 +198,7 @@ def __init__( ) if lo == -1: # Now we can gather and insert null automatically - code[code == -1] = np.iinfo(size_type_dtype).min + code[code == -1] = np.iinfo(SIZE_TYPE_DTYPE).min result_col = level._column.take(code, nullify=True) source_data[i] = result_col._with_type_metadata(level.dtype) @@ -1578,11 +1577,11 @@ def droplevel(self, level=-1) -> Self | cudf.Index: def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.MultiIndex: - # cudf uses np.iinfo(size_type_dtype).min as missing code + # cudf uses np.iinfo(SIZE_TYPE_DTYPE).min as missing code # pandas uses -1 as missing code pd_codes = ( code.find_and_replace( - column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(np.iinfo(SIZE_TYPE_DTYPE).min, length=1), column.as_column(-1, length=1), ) for code in self._codes @@ -1903,7 +1902,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = column.as_column( -1, length=len(target), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): return _return_get_indexer_result(result.values) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 0abd42d4d4e..eedd777aafe 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations import itertools @@ -12,13 +12,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor -from cudf.utils.dtypes import min_unsigned_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, min_unsigned_type if TYPE_CHECKING: from cudf._typing import Dtype @@ -1333,10 +1332,10 @@ def _one_hot_encode_column( else: column = column._get_decategorized_column() # type: ignore[attr-defined] - if column.size * categories.size >= np.iinfo(size_type_dtype).max: + if column.size * categories.size >= np.iinfo(SIZE_TYPE_DTYPE).max: raise ValueError( "Size limitation exceeded: column.size * category.size < " - f"np.iinfo({size_type_dtype}).max. Consider reducing " + f"np.iinfo({SIZE_TYPE_DTYPE}).max. Consider reducing " "size of category" ) result_labels = ( diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 6d617cbf38e..7e8468c8e8a 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations import errno @@ -16,11 +16,13 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_hashable, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) from cudf.utils.performance_tracking import _performance_tracking _CSV_HEX_TYPE_MAP = { diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ff326e09315..16c7d189dfd 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import os @@ -14,10 +14,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from cudf.core.column import ColumnBase diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index f3124552fd1..0ac2950a22b 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
from __future__ import annotations import itertools @@ -11,11 +11,11 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock from cudf.core.index import _index_from_data from cudf.utils import ioutils +from cudf.utils.dtypes import dtype_to_pylibcudf_type try: import ujson as json # type: ignore[import-untyped] diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 31a8f4de3b3..9e932acb5fa 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime @@ -11,6 +11,8 @@ import pyarrow as pa from pandas.core.dtypes.common import infer_dtype_from_object +import pylibcudf as plc + import cudf if TYPE_CHECKING: @@ -151,7 +153,7 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif cudf.api.types.is_decimal128_dtype(dtype): return cudf.core.dtypes.Decimal128Dtype - elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: + elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: return dtype.type return infer_dtype_from_object(dtype) @@ -604,6 +606,66 @@ def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: return dtype.base +def dtype_to_pylibcudf_type(dtype) -> plc.DataType: + if isinstance(dtype, cudf.ListDtype): + return plc.DataType(plc.TypeId.LIST) + elif isinstance(dtype, cudf.StructDtype): + return plc.DataType(plc.TypeId.STRUCT) + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = plc.TypeId.DECIMAL128 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = plc.TypeId.DECIMAL64 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = plc.TypeId.DECIMAL32 + return plc.DataType(tid, -dtype.scale) + # libcudf types don't support timezones so convert to the base type + elif isinstance(dtype, pd.DatetimeTZDtype): + dtype = _get_base_dtype(dtype) + else: + dtype = np.dtype(dtype) + return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) + + +SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { + np.dtype("int8"): plc.types.TypeId.INT8, + np.dtype("int16"): plc.types.TypeId.INT16, + np.dtype("int32"): plc.types.TypeId.INT32, + np.dtype("int64"): plc.types.TypeId.INT64, + np.dtype("uint8"): plc.types.TypeId.UINT8, + np.dtype("uint16"): plc.types.TypeId.UINT16, + np.dtype("uint32"): plc.types.TypeId.UINT32, + np.dtype("uint64"): plc.types.TypeId.UINT64, + np.dtype("float32"): plc.types.TypeId.FLOAT32, + np.dtype("float64"): plc.types.TypeId.FLOAT64, + np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, + np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, + np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, + np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, + np.dtype("object"): plc.types.TypeId.STRING, + np.dtype("bool"): plc.types.TypeId.BOOL8, + np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, + np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, +} +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { + plc_type: np_type + for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() +} +# There's no 
equivalent to EMPTY in cudf. We translate EMPTY +# columns from libcudf to ``int8`` columns of all nulls in Python. +# ``int8`` is chosen because it uses the least amount of memory. +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype( + "object" +) +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") + + +SIZE_TYPE_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] + # Type dispatch loops similar to what are found in `np.add.types` # In NumPy, whether or not an op can be performed between two # operands is determined by checking to see if NumPy has a c/c++ From 559cda24e4258da1aa35b7de60f46e8a86b1effa Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:18:27 -0800 Subject: [PATCH 22/26] Use 64-bit offsets only if the current strings column output chunk size exceeds threshold (#17693) This PR improves on #17207 and only uses 64-bit offsets if the current output chunk of a strings column exceeds the large-strings threshold instead of using cumulative strings column sizes per `pass` or `row group` level. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17693 --- cpp/src/io/parquet/reader_impl.cpp | 48 +++++++-------------- cpp/src/io/parquet/reader_impl_chunking.hpp | 5 +-- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c48ff896e33..f9fcca6bb4f 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num _stream); } - // Compute column string sizes (using page string offsets) for this subpass + // Compute column string sizes (using page string offsets) for this output table chunk col_string_sizes = calculate_page_string_offsets(); - // ensure cumulative column string sizes have been initialized - if (pass.cumulative_col_string_sizes.empty()) { - pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); - } - - // Add to the cumulative column string sizes of this pass - std::transform(pass.cumulative_col_string_sizes.begin(), - pass.cumulative_col_string_sizes.end(), - col_string_sizes.begin(), - pass.cumulative_col_string_sizes.begin(), - std::plus<>{}); - // Check for overflow in cumulative column string sizes of this pass so that the page string // offsets of overflowing (large) string columns are treated as 64-bit. 
auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), - pass.cumulative_col_string_sizes.cend(), + auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), + col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; }); if (has_large_strings and not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } - // Mark any chunks for which the cumulative column string size has exceeded the - // large strings threshold - if (has_large_strings) { - for (auto& chunk : pass.chunks) { - auto const idx = chunk.src_col_index; - if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } - } + // Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output + // column chunks and the large strings threshold. + for (auto& chunk : pass.chunks) { + auto const idx = chunk.src_col_index; + chunk.is_large_string_col = (col_string_sizes[idx] > threshold); } } @@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and col_string_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data( - col_string_sizes[pass.chunks[c].src_col_index], - pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > - static_cast(strings::detail::get_offset64_threshold()), - _stream); + out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], + pass.chunks[c].is_large_string_col, + _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -416,11 +400,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { - // need to cap off the string offsets column - auto const sz = static_cast(col_string_sizes[idx]); - if (sz <= strings::detail::get_offset64_threshold()) { + // only if it is not a large strings column + if (col_string_sizes[idx] <= + static_cast(strings::detail::get_offset64_threshold())) { out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); - final_offsets.emplace_back(sz); + final_offsets.emplace_back(static_cast(col_string_sizes[idx])); } } } diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index ca46f198bb8..4a773fbced1 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -130,9 +130,6 @@ struct pass_intermediate_data { rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; - // cumulative strings column sizes. - std::vector cumulative_col_string_sizes{}; - int level_type_size{0}; // skip_rows / num_rows for this pass. 
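To make the behavioral change in this patch concrete, here is a condensed host-side
sketch of the new per-chunk decision. This is an illustration only, not the actual
reader code: `chunk_desc` is a stand-in for the reader's internal column-chunk
descriptor, and `threshold` is assumed to come from
`strings::detail::get_offset64_threshold()` as in the diff above.

    #include <cstddef>
    #include <vector>

    struct chunk_desc {
      std::size_t src_col_index;  // which input column this chunk belongs to
      bool is_large_string_col;   // whether to use 64-bit string offsets
    };

    // Mark/unmark each column-chunk descriptor using only the string bytes of
    // the *current* output table chunk; no cumulative per-pass or per-row-group
    // totals are kept anymore.
    void mark_large_string_cols(std::vector<chunk_desc>& chunks,
                                std::vector<std::size_t> const& col_string_sizes,
                                std::size_t threshold)
    {
      for (auto& chunk : chunks) {
        chunk.is_large_string_col = col_string_sizes[chunk.src_col_index] > threshold;
      }
    }

Because the flag is assigned rather than OR-ed into a sticky value, a column that
exceeded the threshold in an earlier chunk can fall back to 32-bit offsets in a
later, smaller chunk; that is the key difference from the cumulative sizes removed
by this patch.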
From fb2413e1505297e737095d97e0732eec52519802 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 10 Jan 2025 10:06:35 -0800 Subject: [PATCH 23/26] Make tests build without relaxed constexpr (#17691) Contributes to https://github.com/rapidsai/cudf/issues/7795 This PR updates tests to build without depending on the relaxed constexpr build option. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17691 --- .../cudf/detail/utilities/integer_utils.hpp | 12 +++-- cpp/include/cudf/utilities/span.hpp | 40 +++++++++------ cpp/src/io/utilities/parsing_utils.cuh | 49 ++++++++++--------- cpp/src/io/utilities/trie.cuh | 8 ++- .../transform/segmented_row_bit_count_test.cu | 4 +- cpp/tests/utilities/column_utilities.cu | 18 ++++--- 6 files changed, 75 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 2e3d71815c0..44a86f1c84f 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ */ #include +#include +#include #include #include @@ -44,13 +46,17 @@ namespace util { * `modulus` is positive. The safety is in regard to rollover. */ template -constexpr S round_up_safe(S number_to_round, S modulus) +CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } auto rounded_up = number_to_round - remainder + modulus; if (rounded_up < number_to_round) { - throw std::invalid_argument("Attempt to round up beyond the type's maximum value"); +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Attempt to round up beyond the type's maximum value", cudf::data_type_error); +#else + CUDF_UNREACHABLE("Attempt to round up beyond the type's maximum value"); +#endif } return rounded_up; } diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index e7b76946248..b5044a58934 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,11 +197,16 @@ struct host_span : public cudf::detail::span_basedata() + offset, count, _is_device_accessible}; } @@ -434,8 +439,8 @@ struct device_span : public cudf::detail::span_basedata() + offset, count}; } @@ -475,28 +480,28 @@ class base_2dspan { * * @return A pointer to the first element of the span */ - [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto data() const noexcept { return _flat.data(); } /** * @brief Returns the size in the span as pair. 
* * @return pair representing rows and columns size of the span */ - [[nodiscard]] constexpr auto size() const noexcept { return _size; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return _size; } /** * @brief Returns the number of elements in the span. * * @return Number of elements in the span */ - [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto count() const noexcept { return _flat.size(); } /** * @brief Checks if the span is empty. * * @return True if the span is empty, false otherwise */ - [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_empty() const noexcept { return count() == 0; } /** * @brief Returns a reference to the row-th element of the sequence. @@ -507,7 +512,7 @@ class base_2dspan { * @param row the index of the element to access * @return A reference to the row-th element of the sequence, i.e., `data()[row]` */ - constexpr RowType operator[](size_t row) const + CUDF_HOST_DEVICE constexpr RowType operator[](size_t row) const { return _flat.subspan(row * _size.second, _size.second); } @@ -517,7 +522,10 @@ class base_2dspan { * * @return A flattened span of the 2D span */ - [[nodiscard]] constexpr RowType flat_view() const { return _flat; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr RowType flat_view() const + { + return _flat; + } /** * @brief Construct a 2D span from another 2D span of convertible type diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 75e45a68842..9833dab282e 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -171,7 +171,10 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag) } // Converts character to lowercase. -constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; } +CUDF_HOST_DEVICE constexpr char to_lower(char const c) +{ + return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; +} /** * @brief Checks if string is infinity, case insensitive with/without sign @@ -515,13 +518,13 @@ struct ConvertFunctor { template and !std::is_same_v and !cudf::is_fixed_point())> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex = false) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex = false) { auto const value = [as_hex, &opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values @@ -564,13 +567,13 @@ struct ConvertFunctor { * @brief Dispatch for boolean type types. 
*/ template )> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex) { auto const value = [&opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values @@ -593,13 +596,13 @@ struct ConvertFunctor { * is not valid. In such case, the validity mask is set to zero too. */ template )> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex) { auto const value = [&opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh index c0efc5b6f20..dbdc4a34277 100644 --- a/cpp/src/io/utilities/trie.cuh +++ b/cpp/src/io/utilities/trie.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,16 +74,14 @@ CUDF_EXPORT trie create_serialized_trie(std::vector const& keys, /* * @brief Searches for a string in a serialized trie. * - * Can be executed on host or device, as long as the data is available - * * @param trie Pointer to the array of nodes that make up the trie * @param key Pointer to the start of the string to find * @param key_len Length of the string to find * * @return Boolean value; true if string is found, false otherwise */ -CUDF_HOST_DEVICE inline bool serialized_trie_contains(device_span trie, - device_span key) +__device__ inline bool serialized_trie_contains(device_span trie, + device_span key) { if (trie.empty()) { return false; } if (key.empty()) { return trie.front().is_leaf; } diff --git a/cpp/tests/transform/segmented_row_bit_count_test.cu b/cpp/tests/transform/segmented_row_bit_count_test.cu index 652b9053582..0e4f623f0a2 100644 --- a/cpp/tests/transform/segmented_row_bit_count_test.cu +++ b/cpp/tests/transform/segmented_row_bit_count_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,7 +74,7 @@ compute_segmented_row_bit_count(cudf::table_view const& input, cudf::size_type s // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. 
auto const size_begin = d_sizes + segment_idx * segment_length; - auto const size_end = std::min(size_begin + segment_length, d_sizes + num_rows); + auto const size_end = cuda::std::min(size_begin + segment_length, d_sizes + num_rows); return thrust::reduce(thrust::seq, size_begin, size_end); })); diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index fb9bdeb0b22..6888f26fd16 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,6 +37,8 @@ #include #include +#include +#include #include #include #include @@ -412,14 +414,16 @@ class corresponding_rows_not_equivalent { T const y = rhs.element(rhs_index); // Must handle inf and nan separately - if (std::isinf(x) || std::isinf(y)) { + if (cuda::std::isinf(x) || cuda::std::isinf(y)) { return x != y; // comparison of (inf==inf) returns true - } else if (std::isnan(x) || std::isnan(y)) { - return std::isnan(x) != std::isnan(y); // comparison of (nan==nan) returns false + } else if (cuda::std::isnan(x) || cuda::std::isnan(y)) { + return cuda::std::isnan(x) != + cuda::std::isnan(y); // comparison of (nan==nan) returns false } else { - T const abs_x_minus_y = std::abs(x - y); - return abs_x_minus_y >= std::numeric_limits::min() && - abs_x_minus_y > std::numeric_limits::epsilon() * std::abs(x + y) * fp_ulps; + T const abs_x_minus_y = cuda::std::abs(x - y); + return abs_x_minus_y >= cuda::std::numeric_limits::min() && + abs_x_minus_y > + cuda::std::numeric_limits::epsilon() * cuda::std::abs(x + y) * fp_ulps; } } else { // if either is null, then the inequality was checked already From dc2a75cba40d38f4a6ba66e652764e96fa6b593d Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Jan 2025 13:22:39 -0500 Subject: [PATCH 24/26] Add special orc test data: timestamp interspersed with null values (#17713) Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17713 --- ...e.timestamp.desynced.snappy.RLEv2.hasNull.orc | Bin 0 -> 5951 bytes ...stamp.desynced.uncompressed.RLEv2.hasNull.orc | Bin 0 -> 6565 bytes python/cudf/cudf/tests/test_orc.py | 6 ++++++ 3 files changed, 6 insertions(+) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc new file mode 100644 index 0000000000000000000000000000000000000000..8772f84c3ba2a7942323f49ac28c3e5b172a91fc GIT binary patch literal 5951 zcmaKwdvH_dnZ_T?xR5lJv24X%8VbY$ylTk?@?*+;e|9od$AXd0?qoGRN57280S zG&aHsjtl{0T3Dr7sAwCiY&WdvCaltKsEEN4O+!UD5U~v)ifu$LL?qbAfQ=0HI_b{r z{ZBGzYY!Sz_4)7SG|3=|4MJiM<0FN zSjS?0h5h~`a{X7BdmWT4P*38k=9?dUW4rG+tVN&lqp#cRa1V?5YL@uTum4!Qy>{QtYjU2fOX;~l)?6%~6{T_DgDlxv67x}lc3 zAy@s-6WnDR^h1M&q1}x``^lk0+?Pj9L#LaDCYpz)%)6AIeq7qRrJUNLwrpu>+v4K= z^ofow8?0LfZCiHRx9sQNd&se6v~$bpt}PSYTc-H0z_qosXKOjLRlRy^i{L)jLtCHt 
z_SOv#Zyj8-b+_P^{qC)Y9@#qj=+@JZZJj8*Z>sktJn>TLcV8;^yrlj+p6z?QHGj`r z`xLK#Eq?WV-d->7z&hUHf8d??0si%A-j(&d*$sTb5BX(2{QEO}!?S$bkN9hzJ?*lAoP>fq!K)e|B)`(=QfX3kYOG0>c)8ZL46-OZae`pntny z`wqe0py0qT&hHeQ__5&JPXt$X31)ZW(mxiKg$iXa7aD$AXnUpbiGSL$Vb6}iy*qaQ zY{&kR6^Hii82$N<)4$j;@yi`kC1LCjmi{VO9to;n3$~PsUB3=K@h`y*uLlPY1b3H) z_eX<=4hBbm6FmKI!HIjsQ*RFA&~WL$50}S=)%Wac`R%alcf(Ko$MA-?hX+L~cmIBP ze|-4Rk>SyIhEI$3O&lGbI<^y|J4@f)S-$K6_3@o8f7t1IZ|4(#+_~XI(e^(T?M)UP zI9YV~&#cz}EIRi=(Unt0v!{y%AF}MJV%eEu!+#aq{!+Z=BUaDZ;{J2R+s_y8O&1@y z!1~T#i%(oEJ~vi;WxRNHg7w`?OUp7#WtW#4K3;13g!TPTm#+P6Y5$d_+dp5rH_Q6r z)uo5OSbE~irRT0Ky)wz#I8`FJUQ(7TkxiEvM3-K?S+XWGxpttO{;m>DQ$45T8cI-LhTNmH>atg+gv1<$c#^Ynk>oYIE z^A4-T>^Ghnd*kNJTO%XaCO2c~^0@N$%$cJjw; zTaNOk%mLG~i8r~kx_952+&x;GxV>j^OS0##Gn0?Hu^tWwW^d?28ip_4j`=@T1Ge9_*iaf6P4L+C8xK zW3y!A@)Kj$oHf*cf90|2jru>0Idj{7I`HdHnt6o}C&wPnz5h=Gzx(8va%10%k=|+k z-huZ%HETES`SaLQ6C1(fN+y zeEh-NGiQ(9y*)X$ba-K6@!Pe0biQz^cW7YM(DB0AhZFNFrpC7om@}*HnV9{j-KCr*k`>cefF5}tn;GvM@@fu#`xG%KN>mP*E@>mn$GtVn$`O~;+;NtYmhvUxYAG&8WDE9H8bQq`YQdE zfy!XP!OC!Dq%v9=tBhABDwCC|%5=e7mD$Q%Wxk4CC9D!viK`@4M8T0NvWlvrtC%Wx zm8Z&E<*V{n1*<|;;i^bgv?^8=uS!%U3r<$0t1?yDs$5mRnq4id7FCN2&QufC+G?_z zs-~-%YIn7#+FNj;+Fu>04pxV%!_|@MXmzYQUU0cOS)Hm*S7)lT)w$}tlr0q&T$PHY z5-B0oN=YdtrKOD2T`(>6N_|qlG$0L1L(;G`B8?WzN#oLlG$~C<)6$GIE6qvsoC29p zCX$I|5*Z=W%19X{qd7}tZkb2smHA|TSwI$)g=Aq_R2GxPWeHhQmXf7q8CjOIQkIvq zO{5M&VX?6kg6F3cn(t2r5E~up**}Dq@PbBB@9z(u#~C ztH>$xHS8K8XI+iBMp8r6XluwCs)nv%YTTS>YP>bR8h=foCRh`y3D-nwqMS`N@tQ#q&e25UpLVb067 z(b`yTyf#srtWDLXYcsW3&c52blC2afMM|+!q9l}BCCPbBNh=wpTj^1Hl|H3k8Bhi} z2bE!EL>X1alyPN3nN+5fX=PTKQ|47{l~5&8iB%F6!8xKLRg{WWF)Fvpqw=bJDnIAA zDyRyn!m5ZWs*0)Ns)Q=ZIjKskGODa9r^>6@YN1-B7IV(13AI*Dswp+CX4G!ANA2ZY zQ2W&Zbx<8rht&~vR2@^tIhWN*bxNI9XVh7BPMz1VHA2o+jaVbm5E`w9)KD5)!)V-` zX^mIo)A%(3O;8ikgf$UOlryJ^YZ98ICZ$PhGMcO=r^#!DT9H<)m1qgAR!eFrEzMn` zb!$CZuhysaYXjP#Hlz)6muaKgm^Q9WXp`EMHm%KQv)q;1ypF9C>O?xRPNE}pS{=!) z(9t?Z=hk_2UY$?p*9CMzu0j{qMIL`r7uCgdaa}@})TMN3u1=TL<#hQvcAcd9iMk}$QJ1dE)Me{(b@_UB zy|7+XFRmx*we@5@RZrJ5_3nC4y_fq)y}v$CAFL16hwCHt(fU|@ocm;bvOZOxuFuqG z>vQ$_26lsxyRJdpAZZ{Pv<+kf)j&5e4Q}o;4c-P{gTEor5NrrFgc~9aQSPROctfHg zxqbl+DKwKh4LLOA!5xBL2)zh;G4vAX3Fx)Zlh9Mp)8Gz6?}pw3y%%~P^nU0A z&0;O>Jy4+9$pAq*lI#4t!;AYjnKK*B)7 zz`)>!!2^RA1|JN57y>W^!955=7={Q8Q5a${#9>Ilkc1%xLmJ$-V93IdgCUPbHX4O! z6roX!MhO}TaF3voL?eYp8jTDZ-Dvcn(TheO8vWoNM`I9;AvA{37(rtcjWIOF(U?GE z65Nw$OrtS_#w;3hXv~9TgA{@kffR##1|$Je3z7s$fuun)Al)E6AidyT0O$UWHK%qXb3*MlFmaj1-JCj0}u! 
z7`-t1VD!TnfH4ST2*xms5g4Q3&cPUmF#%%|#uSWc7&9&!d^0a~jYrLbDjnlH8vcy+$;X)ACs~ z(`aUGX*mZGXt}G<|nf-d%=5T=8Hu>0CRAbUgX0tM_`W5 zzQ3sB;5~V}$^df;=Jf5rEX-M$b1>)8!bXb_ymk4<4x>ea79xM7A1x$WsJRsev@mFK zgZB(tylC;E#XmPaizb7;xm{cIMk zLbQs|Dn_dWtpr-N^NGV~rO-+*l=q|6jaJXX3kJ0M(CSBP0Ifl^hR_;DYZR?9w8qhz zKx-1MDYT~1nnCO0&F({M9uymt5R?d%7?cDQ0ZI#s1n)IaG$;m?87nBc_A5;KT z5WIt+!k{9cqM%}+;-C_slAuzc(%`)XDhnzHDh~@A79lJmSj4bMU?IRe0t*QX1q%%e z1B)9L4=i3-e6aYzI}S?_mJlpqSR$}QVTr*Kha~|^618G9NO||XQN#R-c_`V(Jnzdfp#t0NwiaFr_s)!-3{I}+P!G^q1})60NR6S z51~Db_6XXe;LV{uj`jrFlW0$&J&pDZ+OufSp*;_N0Xl@}5TQei4hcF4bZF5*qJu&Q z4gL~zxY6N3hZh|_bokK`Kt~WAA#{YnUxtn-I%4REqa%TiBsx;)NTVZzjx6{qdj~u6 zR<>1W64;tv;*Y8n6cW3TxOJu|};iYuuW!Cao!J+L~Sb z)6Co0Hla;q6Wb& zPubJ8Iw3a4mil%9XcJrU1y|j<^(*Zh2hv+aJp`-jwbevAm zNjgQR=?tBvb9CMzbch^chr~fRv<}iiIcWZ{!|m`mybhnk?+7@8j*uhFf7uar#2j%) z!jW{O9BD_!k>&4m5O*9I^&&*&SYn*GtGaiGuxT#%y+T7gk7R8ahIfv;2-HCyQnU@i|KNAdAht^zAiui zcvrA1)D`ZEbVa*jUGc6&SCW6SE8Ugp%68?t^4;uiVYjGT+)Z?AyUA{<)E@yCdDv?pSx6f4Mu^o$5|^XS%c9x$eA^?G*B_I>k;FIcHu_;1aq-F0o7EB3xP* z>7rb;V2R7^^0>S%pUdwGxPq>bD=b*%in?O1xGUjGx>ByRE91%vR=V;%>>go{s7Kr* z=^=WwJ!B8v!}Pd&JU!kXUyr{h&=V9Wdcr-Co@h_3C*G6jN%o|A(gIyiwkOwb literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc new file mode 100644 index 0000000000000000000000000000000000000000..f5a1edbb10ee55f131764e18b638c35c8e06b0cf GIT binary patch literal 6565 zcmcI}eRLD|neHQ5Uls;CA_*YU;L0D8Mz%giwk3~b%O2U5JhCl$XcDS4b*gMPRM`fq zq_GiBaAXJ|vxQZfg^KQmD%%^b=uKFq-B1yOBbtVaZXlu!Ac}27K8Q%LkpUYS>~-7S zy}fsvJ@?#m`uk(v=ltI1eV_N8-^`pd=XJ7Hu~5Pn4HUWX2k@b20m#Kd{BcR{6HP8+ z?pb~+cx!*yP#Qb%(`eYB_*AGpce>QY5v#=FMjqz5Ur8<5Q6Tw}=$weTcX{CO?@6WC z-x)rzzT(}nYYd0Q|)KY48aZ$JO&?E>xtU$DOU1^W-Y^aYLw zzhM7?uYQr^_Y$sS^MfCKQ5`RRLFE6R)SpfE9}=!(dvE>j&(!f3kotWc3vK`Wga1qJ zFC_J+ZT}aEg*sk;?@#LZZ_nyaGyNxi=|9o`3I2=Jf4Kjwge&orAOG^->-c|s^1iZr zzAz^SL>w&VLQw!C4;(($2(VU&J`uJRqEL+ei*Tq2#~0)565LgcnG%lZF3xfZN3j$u z?&jF<;jForvvwJ$Z#llag0n};*}sx==swQzFF|=f=khAfOfDn(GIzNQx-zb@oNKS( zuBqg%m7`w4-KOO3A-MZh+(T7puI8Rqb1!SSGui@?uAr10)anPDY6e}kgOBo;t~U%0 z7zcON5ALG|5At6eF%O<<7#wdLoV4szfBa!d^X5`|v(~z~sb#ZE@cl>IHm|pB9md_KeQWWfZwdBz z1^b^69Qqr<@o(dQJ}J1oPB61xDEf|Yxex#Rl+gIJ(EeTFnrDP-d-3x=;kFIJJ$~W- zjlx6h_}kPcXP*;Z-XxqESn}laMOOnN#h}QzS!CZLTJr+l-zw_cCfc@Lv?nOqKZJ8T zM92SDboP6q%R5CgyKwRE3YUip6)zSVzh7v7sqoRi-@bnL_JKXycl}`dzTy=J_ii8g z;r3Jiuzma=w@(&_u`gKiqhM(ysC^~aR3df#IQZ!Q2(EuMIIusst0cTH8a#L)IP#O= zss9xmze_s#`VbBdmHhKiX>3S)*WRX|4!M3d^yt3~t$%Z9K(cbz&xiKKhYlVd8hLBz zlw|Mtk)g?>J20}NyH<0`&H4NWYPW;MTdUPY5v!u zv+osMK3Oz#YO&~jPJ3#x;`CzUZx-8syLin9obEG=`_3-jc5d;W^y2;JIbZ$V;^P+< zpB-I%d2I2_IOiJ|mn_dLQCwPL{BVi=BhI%zUb6O+C4HBdZ2NS{o-F4(SC$<5Y{~J@ zmz=%2 zzM|x{`2k>{jMt+zPE7QgBA z=xaBo-xwagIHX$hE@ zj=#>I(ZBr`zhu*9bJ;~r;mqp9k$WePayIKPzP@yG|!Ub0C0&O9$OTzmI>{olQG^!~o-cSkMbu3i0GKD5X-EIU4G z%h^JG_m&^6++g_Cs57_q`~5%us8LY(P;&I4+`E6@|Fe&dsyFmLAL*GA?&*K$V+*-q z_pe7EA73Bt|Lw;|>o&akucPaxw*Ij1ez}F(aOS1>^@*7Xj?YmLD zf7teRGn?Q0o4NfroYCQjvL_#${K^A!2XE~B$?)S>obLSj4{qOk&&k(@d#{}QdVcoI z+{n#h{ySSgb9!c1o}D{>(-|Ay{r2;JJ!?8Qck1TOpANtJxzjuOSMIs9H&6b{@SC5X zd~$Z<`P-L2ifp8M8R&))vA zPgd1mT>jqNhN-ca`{ysOYR)e^HDj;odF9zvpRQ`pKa`sL!S%6M?^v@IHvjT(8xP!A zcVKeQ6Bc)V>`dd~8xybJ8T`yFU%BdBGqiXR?msK$H&+G!|k~h z#kPxv({rzKpPL^kdhWAjM++9-vh_XMrSBKq;5@slr?2=TcV=YH)w6%uMEEgXL@@^@Jz$G-p4Pz`poe8-gOt=?KNX-?}bl# zhsSz1jSBl3F7|F6Dd`)&)Vp)kwxQvp-rb{*Z+MHd_2tp6{)SI`UmbnbKYXS4^^wwz z4WIYEIr_=Q;fda(W5WK1Ytz3P)AV1Mntp%G`RpS%CW}_Bd-lTg^u@8A&pmQ$`qQ!3 
zadCF~+StiWkKCEQH8!#7!u(`OY4O0U|GG*C}D&qy0DwCC|%5-I>GFzFe%*(lQalsY2R4$Vfa#Bvo zX*nZj(bU{)TNC*(NkvMLR%8@e-bzJY$yJJ#5~Wlr zQxZy2N%6{*jFMHll^&&6=~McZ0cDV8pJye4M2H9z5h6;&h&YiTk~{~ICNe~p$PsxJS0z?S zR8rn*6`>+kl!{g{DpuuIc~oBB!z#ZjpbDx&s<0}eimGC&xGJeisnV*9Dyzz=@>Sd_ zG4F{gX_c&ss3NPVD!PiPVyoP|r>eYFzAArJpek4ustQ*{s-nD&Rq?7sRkA8om9ENE zWvg;kdEQ{PxLQ&zt(H|2)nqkQO;M-xc>S%SWI$oWq zPFAO?)76>kEN^dhUd>gD)e^N-EmIR}Qcdw*Q8Q{*?N)o#UbRo{R|nKV-T`%39Z^Ts zF?C#>P$$(Xby}TO=hS%(S0mO)G*XRBL+}o3C=IP)G_1z0@o2mnpT^HSrU`07ny@CK ziE3h+xF(@V@=j>dnv5o^$!YRhu2!s-Xr;W^b1bga(Jo6>o8KAm3| z&;@lNU04^QcJ2E~Crpa=JVzCMBell#v8Uk`zgk41W>nCOxE=^pSot zKnBSW8Rjn~qhySXlL<0OrpPpzA+!9IWM0qJi}ez{R4>yLdQwmE%k+$%)w}f`y;twk z`}F~Rkgw8*^^vbXrjP1l`nW!!PwG?pG+(dJ>T~*h4Yx*IBdL+r$Z80_sfMbdYnU3g z#$Ds7@z(fi{CsOouqIR!u8Gt{YhpF=nnX>K@2E-FWNNZCxte?}w^m#$sg>3ewPY<- zOV={BY^}T2Q|skFTX^5U6jAEE?$?YORk$oT?%z+)MZeY zMO_YcdGH5e5W^sWK?;Km1_A~W1_}lm1_t~g7~C*;VDQ4=gTW6&0EQq8AsE8ozX(GV zh8PTS7!oifVMxJ{h9Lt(7W}<1KsF$Ih0RJ%R zDb&-bXHd_g-i>+>>b!Fp)4(FwrnEFtISXVe-P{gUJt50Hz>J zA(+B2MPQ18KMPYFrUXn$m{KsMVamXig((M99)bdx#V|`?mclH9nShyunSz;ynSo#t z%x;)HFneM4!R&`Q0CN!L5X@l+mckr`IRm_Y-92694n2!b*+Flb=W;6{T74c>`6eQ5BbAu#!{5dsw&!jp#<_)#>(&=5yM z0u4ztq^`ZV5UWQ+_S&@tejbh7oYRO#2^ytnl;wWC5NkpsHKm+EBZEeEYU?31deG=a zqYsUKGzPDG`p_6gW90hCLSPJy@f%-SASWSkpfQcc3>vd&%-xtd1Pd1y@y!SOV39(w z`sPoKu#m7&u+XqDu&}VWr@uD?ix+~2r$1ZZ2Ve=#Fbn)JED>0uGw&|oI0TQ~sxZQm zf+c+`Fat{#mK-d3G;z@+hTw_(BZtr=Llcod+=nI#P4w&vBbrz=xgmH8Og$JL9-OiGBgut zCg&1|&`hJ5nJ?`_vm4Ex`R9yi_MzF2<^Y<5Xbzz{jOHksV`z?}If3RRnp0>_qd9}- zg_GTj<~(RFXfbFBXenqJXaY0|nu6dJ&UVsv=_7wv>$W;bP$3Apu?aeprfE; zpyQwupp&3epwke%0Xhph2RaWc7gjN>5?H0M%3viRI1DQVD-A0HD+{X|Ru8ORSbebi zAvgwW5Y`Z^VOS%uMq!P?8izFjYZ8JJu%=|JqcwuoCbVyxosYs*XFbNZ2?v-^csd(a-ThwTx2)E=|P?FoBQ=&+~l8GF{Av*+8n?c#Px zyHvQkooFZ9sdl=ZX=mHr?Vfh8@ZolUd!Rko9%>J_N7|$9vG%y|vG!zpsy*GFY0tLj z+Vc#T5euJSq>PLq7?PnFnqe50aSNYfyo`_WGXW;ZgqScBVWPr~Oq@wDNhZalnGBO< za!lSKc1Rpjhs;4ZNC)Mh9gJ|u;dXc&UWd=&cLW?kN5~NtzUYWLVve{Y;Yd1Cj-)6_(})U!FISiJRRN+Ux&XV&=C|K=m>X2I-(u1j(A6+ zBiWJaNDJTS$ads9@}1mHai^qH+9~TKgoit+PP&unWINrRo=$J4uhTC))*0*!b%r}5 zozc!%XS_4fnG~MrOm}8Fvz@uld>6M%+$HIfb`f1<7u7{~FC=dIcX>3WSwr|l+)|< zIsMLnGw2LC!_J5^Dx7u3oe5{snR2F`8E4j+bLK?_F0o7ElDcFr!bQ3$7wuw1i(GD( z$K`eTTz*%;6?BDMVbM}o)D?5ZT?tpxm2#zB8CO=c(v|P#c8j|u-O_GZH_=UYQ{7BA z+wJc5bbGsf-Tv-CcTlA24tGboqusIYcz2>Z*`4Z6i}c;u?p$}C<+5T{!b({gONdM? z#nLRpvaFl+uwK^3`bAbY$cET38)2htjE%DiHYswjX*R=V*&Lf+&0Q^CEmvNHvBkoO;}25{Q9;SHPFl2J)-L~zw$!Bf@NU@(QIUkhkrsV>L(kLS`t~=U zdiopRUSA3Yw2+BJg?~ia5AUY_n4kJ@^EV9?**P9Qsn)9XqQ6<_Wnh3O Date: Mon, 13 Jan 2025 08:47:23 -0500 Subject: [PATCH 25/26] Add seed parameter to hash_character_ngrams (#17643) Adds a seed parameter to the `nvtext::hash_character_ngrams` API. Makes this more useful in conjunction with other nvtext APIs. 
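A minimal usage sketch of the extended API follows. It is illustrative only:
`input` is assumed to be a strings column built elsewhere, and the seed values
0 and 42 are arbitrary.

    #include <cudf/strings/strings_column_view.hpp>
    #include <nvtext/generate_ngrams.hpp>

    // Hash 5-character ngrams of the same column under two different seeds.
    // Each call returns a lists column of UINT32 hash values, one list per row;
    // distinct seeds yield independent hash families, which is useful when
    // building MinHash-style signatures on top of these hashes.
    void hash_with_two_seeds(cudf::strings_column_view const& input)
    {
      auto hashes_a = nvtext::hash_character_ngrams(input, /*ngrams=*/5, /*seed=*/0);
      auto hashes_b = nvtext::hash_character_ngrams(input, /*ngrams=*/5, /*seed=*/42);
      // hashes_a and hashes_b differ row-wise even though the ngram sets match.
    }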
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17643 --- cpp/include/nvtext/detail/generate_ngrams.hpp | 39 ------------------- cpp/include/nvtext/generate_ngrams.hpp | 4 +- cpp/src/text/generate_ngrams.cu | 13 ++++--- cpp/tests/streams/text/ngrams_test.cpp | 4 +- cpp/tests/text/ngrams_tests.cpp | 13 ++++++- 5 files changed, 25 insertions(+), 48 deletions(-) delete mode 100644 cpp/include/nvtext/detail/generate_ngrams.hpp diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp deleted file mode 100644 index ae48fed4e79..00000000000 --- a/cpp/include/nvtext/detail/generate_ngrams.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -namespace CUDF_EXPORT nvtext { -namespace detail { - -/** - * @copydoc hash_character_ngrams(cudf::strings_column_view const&, - * cudf::size_type, rmm::device_async_resource_ref) - * - * @param stream CUDA stream used for allocating/copying device memory and launching kernels - */ -std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, - cudf::size_type ngrams, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace detail -} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 54282b8ef3c..b2ba1798a8f 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -117,6 +117,7 @@ std::unique_ptr generate_character_ngrams( * * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. Default is 5. + * @param seed The seed value to use with the hash algorithm. Default is 0. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return A lists column of hash values @@ -124,6 +125,7 @@ std::unique_ptr generate_character_ngrams( std::unique_ptr hash_character_ngrams( cudf::strings_column_view const& input, cudf::size_type ngrams = 5, + uint32_t seed = 0, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 997b0278fe2..33d52ccd570 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ #include #include -#include +#include #include #include @@ -315,6 +315,7 @@ namespace { */ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + uint32_t seed, cudf::size_type const* d_ngram_offsets, cudf::hash_value_type* d_results) { @@ -332,7 +333,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st __shared__ cudf::hash_value_type hvs[block_size]; // temp store for hash values auto const ngram_offset = d_ngram_offsets[str_idx]; - auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{seed}; auto const end = d_str.data() + d_str.size_bytes(); auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1; @@ -368,6 +369,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& input, cudf::size_type ngrams, + uint32_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -400,7 +402,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co auto d_hashes = hashes->mutable_view().data(); character_ngram_hash_kernel<<>>( - *d_strings, ngrams, d_offsets, d_hashes); + *d_strings, ngrams, seed, d_offsets, d_hashes); return make_lists_column( input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr); @@ -419,11 +421,12 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + uint32_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::hash_character_ngrams(strings, ngrams, stream, mr); + return detail::hash_character_ngrams(strings, ngrams, seed, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp index 221c0a62f3e..47b9ac46d12 100644 --- a/cpp/tests/streams/text/ngrams_test.cpp +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ TEST_F(TextNGramsTest, HashCharacterNgrams) auto input = cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); nvtext::hash_character_ngrams( - cudf::strings_column_view(input), 5, cudf::test::get_default_stream()); + cudf::strings_column_view(input), 5, 5, cudf::test::get_default_stream()); } TEST_F(TextNGramsTest, NgramsTokenize) diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index c72c7cfc80e..1a737231389 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -159,6 +159,17 @@ TEST_F(TextGenerateNgramsTest, NgramsHash) 2319357747u}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::hash_character_ngrams(view, 10, 10); + // clang-format off + LCW expected2({LCW{2818025299u, 4026424618u, 578054337u, 2107870805u, 3942221995u, + 2802685757u, 2686450821u, 584898501u, 2206824201u, 487979059u}, + LCW{1154048732u, 3209682333u, 3246563372u, 3789750511u, 1287153502u, + 3759561568u, 1092423314u, 339538635u, 4265577390u, 879551618u, + 4222824617u, 1774528854u, 1028254379u, 485918316u, 879142987u, 3619248543u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } TEST_F(TextGenerateNgramsTest, NgramsHashErrors) From 4ec389b4c515e4b3b85d6fd28b1471b1f1de830d Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 13 Jan 2025 09:27:13 -0800 Subject: [PATCH 26/26] Implement `HOST_UDF` aggregation for reduction and segmented reduction (#17645) Following https://github.com/rapidsai/cudf/pull/17592, this enables `HOST_UDF` aggregation in reduction and segmented reduction, allowing execution of a host-side user-defined function (UDF) through the libcudf aggregation framework. Closes https://github.com/rapidsai/cudf/issues/16633. Authors: - Nghia Truong (https://github.com/ttnghia) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Robert (Bobby) Evans (https://github.com/revans2) - Kyle Edwards (https://github.com/KyleFromNVIDIA) - Yunsong Wang (https://github.com/PointKernel) - https://github.com/nvdbaranec URL: https://github.com/rapidsai/cudf/pull/17645 --- cpp/include/cudf/aggregation.hpp | 4 +- cpp/include/cudf/aggregation/host_udf.hpp | 478 +++++++++++------- .../cudf/detail/aggregation/aggregation.hpp | 6 +- cpp/src/groupby/groupby.cu | 9 +- cpp/src/groupby/sort/aggregate.cpp | 81 ++- cpp/src/groupby/sort/host_udf_aggregation.cpp | 48 +- cpp/src/reductions/reductions.cpp | 16 +- cpp/src/reductions/segmented/reductions.cpp | 17 +- cpp/tests/CMakeLists.txt | 3 +- cpp/tests/groupby/host_udf_example_tests.cu | 75 +-- cpp/tests/groupby/host_udf_tests.cpp | 245 ++++----- .../reductions/host_udf_example_tests.cu | 422 ++++++++++++++++ .../main/java/ai/rapids/cudf/Aggregation.java | 2 +- .../ai/rapids/cudf/GroupByAggregation.java | 2 +- .../ai/rapids/cudf/ReductionAggregation.java | 17 +- .../cudf/SegmentedReductionAggregation.java | 11 +- 16 files changed, 941 insertions(+), 495 deletions(-) create mode 100644 cpp/tests/reductions/host_udf_example_tests.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index a1b7db5e08a..2b2a660bed7 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -601,7 +601,7 @@ std::unique_ptr make_udf_aggregation(udf_type type, data_type output_type); // Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation. -struct host_udf_base; +class host_udf_base; /** * @brief Factory to create a HOST_UDF aggregation.
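Before the header changes below, a sketch of the call path this patch enables (the names `my_reduce_udf` and `input_column` are illustrative, not from the patch): a user-defined class deriving from the new `cudf::reduce_host_udf` interface is wrapped into a `HOST_UDF` aggregation and then passed to `cudf::reduce` like any built-in reduction.

// Wrap a user-defined reduction (a hypothetical class my_reduce_udf deriving
// from cudf::reduce_host_udf) in a HOST_UDF aggregation and run it through
// the regular reduction API. The output type must match what the UDF produces.
auto agg = cudf::make_host_udf_aggregation<cudf::reduce_aggregation>(
  std::make_unique<my_reduce_udf>());
auto result = cudf::reduce(input_column,
                           *agg,
                           cudf::data_type{cudf::type_id::INT64},
                           cudf::get_default_stream(),
                           cudf::get_current_device_resource_ref());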
diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp index bbce76dc5f3..451d75137e4 100644 --- a/cpp/include/cudf/aggregation/host_udf.hpp +++ b/cpp/include/cudf/aggregation/host_udf.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,18 +17,16 @@ #pragma once #include +#include #include #include #include -#include #include #include +#include #include -#include -#include -#include /** * @file host_udf.hpp @@ -43,49 +41,141 @@ namespace CUDF_EXPORT cudf { */ /** - * @brief The interface for host-based UDF implementation. + * @brief The fundamental interface for host-based UDF implementation. * - * An implementation of host-based UDF needs to be derived from this base class, defining - * its own version of the required functions. In particular: - * - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`, - * and `clone` functions. - * - If necessary, the derived class can also override `do_hash` to compute hashing for its - * instance, and `get_required_data` to selectively access to the input data as well as - * intermediate data provided by libcudf. + * This class declares the functions `do_hash`, `is_equal`, and `clone` that must be defined in + * the users' UDF implementation. These functions are required for the libcudf aggregation + * framework to perform its operations. + */ +class host_udf_base { + // Declare constructor private to prevent the users from deriving from this class. + private: + host_udf_base() = default; ///< Default constructor + + // Only allow deriving from the structs below. + friend struct reduce_host_udf; + friend struct segmented_reduce_host_udf; + friend struct groupby_host_udf; + + public: + virtual ~host_udf_base() = default; ///< Default destructor + + /** + * @brief Computes hash value of the instance. + * + * Overriding this function is optional; it is needed when the derived class has data members, + * so that each instance can be differentiated from the others. + * + * @return The hash value of the instance + */ + [[nodiscard]] virtual std::size_t do_hash() const + { + return std::hash<int>{}(static_cast<int>(aggregation::Kind::HOST_UDF)); + } + + /** + * @brief Compares two instances of the derived class for equality. + * @param other The other instance to compare with + * @return True if the two instances are equal + */ + [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + + /** + * @brief Clones the instance. + * + * The instances of the derived class should be lightweight for efficient cloning. + * + * @return A new instance cloned from this one + */ + [[nodiscard]] virtual std::unique_ptr<host_udf_base> clone() const = 0; +}; + +/** + * @brief The interface for host-based UDF implementation for reduction contexts. + * + * An implementation of host-based UDF for reduction needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such derived classes must also define the `operator()` function to perform reduction + * operations.
* - * Example of such implementation: + * Example: * @code{.cpp} - * struct my_udf_aggregation : cudf::host_udf_base { + * struct my_udf_aggregation : cudf::reduce_host_udf { * my_udf_aggregation() = default; * - * // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`, - * // and the result from groupby `MAX` aggregation. - * [[nodiscard]] data_attribute_set_t get_required_data() const override + * [[nodiscard]] std::unique_ptr<scalar> operator()( + * column_view const& input, + * data_type output_dtype, + * std::optional<std::reference_wrapper<scalar const>> init, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override * { - * return {groupby_data_attribute::GROUPED_VALUES, - * groupby_data_attribute::GROUP_OFFSETS, - * cudf::make_max_aggregation()}; + * // Perform reduction computation using the input data and return the reduction result. + * // This is where the actual reduction logic is implemented. * } * - * [[nodiscard]] output_t get_empty_output( - * [[maybe_unused]] std::optional<data_type> output_dtype, - * [[maybe_unused]] rmm::cuda_stream_view stream, - * [[maybe_unused]] rmm::device_async_resource_ref mr) const override + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override * { - * // This UDF aggregation always returns a column of type INT32. - * return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. + * return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr; * } * - * [[nodiscard]] output_t operator()(input_map_t const& input, - * rmm::cuda_stream_view stream, - * rmm::device_async_resource_ref mr) const override + * [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override * { - * // Perform UDF computation using the input data and return the result. + * return std::make_unique<my_udf_aggregation>(); + * } + * }; + * @endcode + */ +struct reduce_host_udf : host_udf_base { + /** + * @brief Perform reduction operations. + * + * @param input The input column for reduction + * @param output_dtype The data type for the final output scalar + * @param init The initial value of the reduction + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation + */ + [[nodiscard]] virtual std::unique_ptr<scalar> operator()( + column_view const& input, + data_type output_dtype, + std::optional<std::reference_wrapper<scalar const>> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; +}; + +/** + * @brief The interface for host-based UDF implementation for segmented reduction context. + * + * An implementation of host-based UDF for segmented reduction needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such a derived class must also define the `operator()` function to perform segmented reduction. + * + * Example: + * @code{.cpp} + * struct my_udf_aggregation : cudf::segmented_reduce_host_udf { + * my_udf_aggregation() = default; + * + * [[nodiscard]] std::unique_ptr<column> operator()( + * column_view const& input, + * device_span<size_type const> offsets, + * data_type output_dtype, + * null_policy null_handling, + * std::optional<std::reference_wrapper<scalar const>> init, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform computation using the input data and return the result. + * // This is where the actual segmented reduction logic is implemented.
* } * * [[nodiscard]] bool is_equal(host_udf_base const& other) const override * { * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. * return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr; * } * @@ -96,198 +186,232 @@ namespace CUDF_EXPORT cudf { * }; * @endcode */ -struct host_udf_base { - host_udf_base() = default; - virtual ~host_udf_base() = default; - +struct segmented_reduce_host_udf : host_udf_base { /** - * @brief Define the possible data needed for groupby aggregations. + * @brief Perform segmented reduction operations. * - * Note that only sort-based groupby aggregations are supported. + * @param input The input column for reduction + * @param offsets A list of offsets defining the segments for reduction + * @param output_dtype The data type for the final output column + * @param null_handling If `INCLUDE` then the reduction result is valid only if all elements in + * the segment are valid, and if `EXCLUDE` then the reduction result is valid if any + * element in the segment is valid + * @param init The initial value of the reduction + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation */ - enum class groupby_data_attribute : int32_t { - INPUT_VALUES, ///< The input values column. - GROUPED_VALUES, ///< The input values grouped according to the input `keys` for which the - ///< values within each group maintain their original order. - SORTED_GROUPED_VALUES, ///< The input values grouped according to the input `keys` and - ///< sorted within each group. - NUM_GROUPS, ///< The number of groups (i.e., number of distinct keys). - GROUP_OFFSETS, ///< The offsets separating groups. - GROUP_LABELS ///< Group labels (which is also the same as group indices). - }; + [[nodiscard]] virtual std::unique_ptr<column> operator()( + column_view const& input, + device_span<size_type const> offsets, + data_type output_dtype, + null_policy null_handling, + std::optional<std::reference_wrapper<scalar const>> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; +}; +// Forward declaration. +namespace groupby ::detail { +struct aggregate_result_functor; +} + +/** + * @brief The interface for host-based UDF implementation for groupby aggregation context. + * + * An implementation of host-based UDF for groupby needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such a derived class must also define the functions `get_empty_output()` to return the result + * when the input is empty, and `operator()` to perform its groupby operations. + * + * During execution, the derived class can access internal data provided by the libcudf groupby + * framework through a set of `get*` accessors, as well as calling other built-in groupby + * aggregations through the `compute_aggregation` function. + * + * @note The derived class can only perform sort-based groupby aggregations. Hash-based groupby + * aggregations require a more complex data structure and are not yet supported. + * + * Example: + * @code{.cpp} + * struct my_udf_aggregation : cudf::groupby_host_udf { + * my_udf_aggregation() = default; + * + * [[nodiscard]] std::unique_ptr<column> get_empty_output( + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Return a column corresponding to the result when the input values column is empty.
+ * } + * + * [[nodiscard]] std::unique_ptr operator()( + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform UDF computation using the input data and return the result. + * } + * + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override + * { + * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. + * return dynamic_cast(&other) != nullptr; + * } + * + * [[nodiscard]] std::unique_ptr clone() const override + * { + * return std::make_unique(); + * } + * }; + * @endcode + */ +struct groupby_host_udf : host_udf_base { /** - * @brief Describe possible data that may be needed in the derived class for its operations. + * @brief Get the output when the input values column is empty. * - * Such data can be either intermediate data such as sorted values or group labels etc, or the - * results of other aggregations. + * This is called in libcudf when the input values column is empty. In such situations libcudf + * tries to generate the output directly without unnecessarily evaluating the intermediate data. * - * Each derived host-based UDF class may need a different set of data. It is inefficient to - * evaluate and pass down all these possible data at once from libcudf. A solution for that is, - * the derived class can define a subset of data that it needs and libcudf will evaluate - * and pass down only data requested from that set. + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation when the input values column is empty */ - struct data_attribute { - /** - * @brief Hold all possible data types for the input of the aggregation in the derived class. - */ - using value_type = std::variant>; - value_type value; ///< The actual data attribute, wrapped by this struct - ///< as a wrapper is needed to define `hash` and `equal_to` functors. - - data_attribute() = default; ///< Default constructor - data_attribute(data_attribute&&) = default; ///< Move constructor - - /** - * @brief Construct a new data attribute from an aggregation attribute. - * @param value_ An aggregation attribute - */ - template )> - data_attribute(T value_) : value{value_} - { - } - - /** - * @brief Construct a new data attribute from another aggregation request. - * @param value_ An aggregation request - */ - template || - std::is_same_v)> - data_attribute(std::unique_ptr value_) : value{std::move(value_)} - { - CUDF_EXPECTS(std::get>(value) != nullptr, - "Invalid aggregation request."); - if constexpr (std::is_same_v) { - CUDF_EXPECTS( - dynamic_cast(std::get>(value).get()) != nullptr, - "Requesting results from other aggregations is only supported in groupby " - "aggregations."); - } - } - - /** - * @brief Copy constructor. - * @param other The other data attribute to copy from - */ - data_attribute(data_attribute const& other); - - /** - * @brief Hash functor for `data_attribute`. - */ - struct hash { - /** - * @brief Compute the hash value of a data attribute. - * @param attr The data attribute to hash - * @return The hash value of the data attribute - */ - std::size_t operator()(data_attribute const& attr) const; - }; // struct hash - - /** - * @brief Equality comparison functor for `data_attribute`. - */ - struct equal_to { - /** - * @brief Check if two data attributes are equal. 
- * @param lhs The left-hand side data attribute - * @param rhs The right-hand side data attribute - * @return True if the two data attributes are equal - */ - bool operator()(data_attribute const& lhs, data_attribute const& rhs) const; - }; // struct equal_to - }; // struct data_attribute + [[nodiscard]] virtual std::unique_ptr get_empty_output( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0; /** - * @brief Set of attributes for the input data that is needed for computing the aggregation. + * @brief Perform the main groupby computation for the host-based UDF. + * + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation */ - using data_attribute_set_t = - std::unordered_set; + [[nodiscard]] virtual std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0; + + private: + // Allow the struct `aggregate_result_functor` to set its private callback variables. + friend struct groupby::detail::aggregate_result_functor; /** - * @brief Return a set of attributes for the data that is needed for computing the aggregation. - * - * The derived class should return the attributes corresponding to only the data that it needs to - * avoid unnecessary computation performed in libcudf. If this function is not overridden, an - * empty set is returned. That means all the data attributes (except results from other - * aggregations in groupby) will be needed. - * - * @return A set of `data_attribute` + * @brief Callback to access the input values column. + */ + std::function callback_input_values; + + /** + * @brief Callback to access the input values grouped according to the input keys for which the + * values within each group maintain their original order. + */ + std::function callback_grouped_values; + + /** + * @brief Callback to access the input values grouped according to the input keys and sorted + * within each group. + */ + std::function callback_sorted_grouped_values; + + /** + * @brief Callback to access the number of groups (i.e., number of distinct keys). */ - [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; } + std::function callback_num_groups; /** - * @brief Hold all possible types of the data that is passed to the derived class for executing - * the aggregation. + * @brief Callback to access the offsets separating groups. */ - using input_data_t = std::variant>; + std::function(void)> callback_group_offsets; /** - * @brief Input to the aggregation, mapping from each data attribute to its actual data. + * @brief Callback to access the group labels (which is also the same as group indices). */ - using input_map_t = std:: - unordered_map; + std::function(void)> callback_group_labels; /** - * @brief Output type of the aggregation. + * @brief Callback to access the result from other groupby aggregations. + */ + std::function)> callback_compute_aggregation; + + protected: + /** + * @brief Access the input values column. * - * Currently only a single type is supported as the output of the aggregation, but it will hold - * more type in the future when reduction is supported. + * @return The input values column. 
*/ - using output_t = std::variant>; + [[nodiscard]] column_view get_input_values() const + { + CUDF_EXPECTS(callback_input_values, "Uninitialized callback_input_values."); + return callback_input_values(); + } /** - * @brief Get the output when the input values column is empty. + * @brief Access the input values grouped according to the input keys for which the values + * within each group maintain their original order. * - * This is called in libcudf when the input values column is empty. In such situations libcudf - * tries to generate the output directly without unnecessarily evaluating the intermediate data. + * @return The grouped values column. + */ + [[nodiscard]] column_view get_grouped_values() const + { + CUDF_EXPECTS(callback_grouped_values, "Uninitialized callback_grouped_values."); + return callback_grouped_values(); + } + + /** + * @brief Access the input values grouped according to the input keys and sorted within each + * group. * - * @param output_dtype The expected output data type - * @param stream The CUDA stream to use for any kernel launches - * @param mr Device memory resource to use for any allocations - * @return The output result of the aggregation when input values is empty + * @return The sorted grouped values column. */ - [[nodiscard]] virtual output_t get_empty_output(std::optional output_dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; + [[nodiscard]] column_view get_sorted_grouped_values() const + { + CUDF_EXPECTS(callback_sorted_grouped_values, "Uninitialized callback_sorted_grouped_values."); + return callback_sorted_grouped_values(); + } /** - * @brief Perform the main computation for the host-based UDF. + * @brief Access the number of groups (i.e., number of distinct keys). * - * @param input The input data needed for performing all computation - * @param stream The CUDA stream to use for any kernel launches - * @param mr Device memory resource to use for any allocations - * @return The output result of the aggregation + * @return The number of groups. */ - [[nodiscard]] virtual output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; + [[nodiscard]] size_type get_num_groups() const + { + CUDF_EXPECTS(callback_num_groups, "Uninitialized callback_num_groups."); + return callback_num_groups(); + } /** - * @brief Computes hash value of the class's instance. - * @return The hash value of the instance + * @brief Access the offsets separating groups. + * + * @return The array of group offsets. */ - [[nodiscard]] virtual std::size_t do_hash() const + [[nodiscard]] device_span get_group_offsets() const { - return std::hash{}(static_cast(aggregation::Kind::HOST_UDF)); + CUDF_EXPECTS(callback_group_offsets, "Uninitialized callback_group_offsets."); + return callback_group_offsets(); } /** - * @brief Compares two instances of the derived class for equality. - * @param other The other derived class's instance to compare with - * @return True if the two instances are equal + * @brief Access the group labels (which is also the same as group indices). + * + * @return The array of group labels. */ - [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + [[nodiscard]] device_span get_group_labels() const + { + CUDF_EXPECTS(callback_group_labels, "Uninitialized callback_group_labels."); + return callback_group_labels(); + } /** - * @brief Clones the instance. + * @brief Compute a built-in groupby aggregation and access its result. 
* - * A class derived from `host_udf_base` should not store too much data such that its instances - * remain lightweight for efficient cloning. + * This allows the derived class to call any other built-in groupby aggregations on the same input + * values column and access the output for its operations. * - * @return A new instance cloned from this + * @param other_agg An arbitrary built-in groupby aggregation + * @return A `column_view` object corresponding to the output result of the given aggregation */ - [[nodiscard]] virtual std::unique_ptr clone() const = 0; + [[nodiscard]] column_view compute_aggregation(std::unique_ptr other_agg) const + { + CUDF_EXPECTS(callback_compute_aggregation, "Uninitialized callback for computing aggregation."); + return callback_compute_aggregation(std::move(other_agg)); + } }; /** @} */ // end of group diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index d873e93bd20..5574ed6ea6e 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -967,7 +967,9 @@ class udf_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying host-based UDF aggregation. */ -class host_udf_aggregation final : public groupby_aggregation { +class host_udf_aggregation final : public groupby_aggregation, + public reduce_aggregation, + public segmented_reduce_aggregation { public: std::unique_ptr udf_ptr; diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 4c90cd0eef5..6234148e9fa 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,8 +145,11 @@ struct empty_column_constructor { } if constexpr (k == aggregation::Kind::HOST_UDF) { - auto const& udf_ptr = dynamic_cast(agg).udf_ptr; - return std::get>(udf_ptr->get_empty_output(std::nullopt, stream, mr)); + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation."); + return udf_ptr->get_empty_output(stream, mr); } return make_empty_column(target_type(values.type(), k)); diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 6480070e85a..fb3f7559d64 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -795,58 +795,41 @@ void aggregate_result_functor::operator()(aggregation con { if (cache.has_result(values, agg)) { return; } - auto const& udf_ptr = dynamic_cast(agg).udf_ptr; - auto const data_attrs = [&]() -> host_udf_base::data_attribute_set_t { - if (auto tmp = udf_ptr->get_required_data(); !tmp.empty()) { return tmp; } - // Empty attribute set means everything. 
- return {host_udf_base::groupby_data_attribute::INPUT_VALUES, - host_udf_base::groupby_data_attribute::GROUPED_VALUES, - host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, - host_udf_base::groupby_data_attribute::NUM_GROUPS, - host_udf_base::groupby_data_attribute::GROUP_OFFSETS, - host_udf_base::groupby_data_attribute::GROUP_LABELS}; - }(); + auto const& udf_base_ptr = dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation."); - // Do not cache udf_input, as the actual input data may change from run to run. - host_udf_base::input_map_t udf_input; - for (auto const& attr : data_attrs) { - CUDF_EXPECTS(std::holds_alternative(attr.value) || - std::holds_alternative>(attr.value), - "Invalid input data attribute for HOST_UDF groupby aggregation."); - if (std::holds_alternative(attr.value)) { - switch (std::get(attr.value)) { - case host_udf_base::groupby_data_attribute::INPUT_VALUES: - udf_input.emplace(attr, values); - break; - case host_udf_base::groupby_data_attribute::GROUPED_VALUES: - udf_input.emplace(attr, get_grouped_values()); - break; - case host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES: - udf_input.emplace(attr, get_sorted_values()); - break; - case host_udf_base::groupby_data_attribute::NUM_GROUPS: - udf_input.emplace(attr, helper.num_groups(stream)); - break; - case host_udf_base::groupby_data_attribute::GROUP_OFFSETS: - udf_input.emplace(attr, helper.group_offsets(stream)); - break; - case host_udf_base::groupby_data_attribute::GROUP_LABELS: - udf_input.emplace(attr, helper.group_labels(stream)); - break; - default: CUDF_UNREACHABLE("Invalid input data attribute for HOST_UDF groupby aggregation."); - } - } else { // data is result from another aggregation - auto other_agg = std::get>(attr.value)->clone(); + if (!udf_ptr->callback_input_values) { + udf_ptr->callback_input_values = [&]() -> column_view { return values; }; + } + if (!udf_ptr->callback_grouped_values) { + udf_ptr->callback_grouped_values = [&]() -> column_view { return get_grouped_values(); }; + } + if (!udf_ptr->callback_sorted_grouped_values) { + udf_ptr->callback_sorted_grouped_values = [&]() -> column_view { return get_sorted_values(); }; + } + if (!udf_ptr->callback_num_groups) { + udf_ptr->callback_num_groups = [&]() -> size_type { return helper.num_groups(stream); }; + } + if (!udf_ptr->callback_group_offsets) { + udf_ptr->callback_group_offsets = [&]() -> device_span { + return helper.group_offsets(stream); + }; + } + if (!udf_ptr->callback_group_labels) { + udf_ptr->callback_group_labels = [&]() -> device_span { + return helper.group_labels(stream); + }; + } + if (!udf_ptr->callback_compute_aggregation) { + udf_ptr->callback_compute_aggregation = + [&](std::unique_ptr other_agg) -> column_view { cudf::detail::aggregation_dispatcher(other_agg->kind, *this, *other_agg); - auto result = cache.get_result(values, *other_agg); - udf_input.emplace(std::move(other_agg), std::move(result)); - } + return cache.get_result(values, *other_agg); + }; } - auto output = (*udf_ptr)(udf_input, stream, mr); - CUDF_EXPECTS(std::holds_alternative>(output), - "Invalid output type from HOST_UDF groupby aggregation."); - cache.add_result(values, agg, std::get>(std::move(output))); + cache.add_result(values, agg, (*udf_ptr)(stream, mr)); } } // namespace detail diff --git a/cpp/src/groupby/sort/host_udf_aggregation.cpp b/cpp/src/groupby/sort/host_udf_aggregation.cpp index 
0da47e17f48..6f1fe80c4bd 100644 --- a/cpp/src/groupby/sort/host_udf_aggregation.cpp +++ b/cpp/src/groupby/sort/host_udf_aggregation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,51 +16,9 @@ #include #include -#include namespace cudf { -host_udf_base::data_attribute::data_attribute(data_attribute const& other) - : value{std::visit(cudf::detail::visitor_overload{[](auto const& val) { return value_type{val}; }, - [](std::unique_ptr const& val) { - return value_type{val->clone()}; - }}, - other.value)} -{ -} - -std::size_t host_udf_base::data_attribute::hash::operator()(data_attribute const& attr) const -{ - auto const hash_value = - std::visit(cudf::detail::visitor_overload{ - [](auto const& val) { return std::hash{}(static_cast(val)); }, - [](std::unique_ptr const& val) { return val->do_hash(); }}, - attr.value); - return std::hash{}(attr.value.index()) ^ hash_value; -} - -bool host_udf_base::data_attribute::equal_to::operator()(data_attribute const& lhs, - data_attribute const& rhs) const -{ - auto const& lhs_val = lhs.value; - auto const& rhs_val = rhs.value; - if (lhs_val.index() != rhs_val.index()) { return false; } - return std::visit( - cudf::detail::visitor_overload{ - [](auto const& lhs_val, auto const& rhs_val) { - if constexpr (std::is_same_v) { - return lhs_val == rhs_val; - } else { - return false; - } - }, - [](std::unique_ptr const& lhs_val, std::unique_ptr const& rhs_val) { - return lhs_val->is_equal(*rhs_val); - }}, - lhs_val, - rhs_val); -} - namespace detail { host_udf_aggregation::host_udf_aggregation(std::unique_ptr udf_ptr_) @@ -99,5 +57,9 @@ template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation); template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation(std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); } // namespace cudf diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 75ebc078930..928625a7e8f 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -144,6 +145,13 @@ struct reduce_dispatch_functor { auto td_agg = static_cast(agg); return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr); } + case aggregation::HOST_UDF: { + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for reduction."); + return (*udf_ptr)(col, output_dtype, init, stream, mr); + } // case aggregation::HOST_UDF default: CUDF_FAIL("Unsupported reduction operator"); } } @@ -161,9 +169,11 @@ std::unique_ptr reduce(column_view const& col, cudf::data_type_error); if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || - agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL || + agg.kind == aggregation::HOST_UDF)) { CUDF_FAIL( - "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); + "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF " + "aggregation types"); } // Returns default scalar if input column is empty or all null diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 1c3a2b0c0f3..5835bfcf0a1 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + +#include #include #include #include @@ -98,6 +100,13 @@ struct segmented_reduce_dispatch_functor { } case segmented_reduce_aggregation::NUNIQUE: return segmented_nunique(col, offsets, null_handling, stream, mr); + case aggregation::HOST_UDF: { + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for segmented reduction."); + return (*udf_ptr)(col, offsets, output_dtype, null_handling, init, stream, mr); + } // case aggregation::HOST_UDF default: CUDF_FAIL("Unsupported aggregation type."); } } @@ -117,9 +126,11 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values, cudf::data_type_error); if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || - agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL || + agg.kind == aggregation::HOST_UDF)) { CUDF_FAIL( - "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); + "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF " + "aggregation types"); } if (segmented_values.is_empty() && offsets.empty()) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 344979e1288..35877ac34b9 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -220,11 +220,12 @@ ConfigureTest( REDUCTIONS_TEST reductions/collect_ops_tests.cpp reductions/ewm_tests.cpp + reductions/host_udf_example_tests.cu + reductions/list_rank_test.cpp reductions/rank_tests.cpp reductions/reduction_tests.cpp reductions/scan_tests.cpp reductions/segmented_reduction_tests.cpp - reductions/list_rank_test.cpp reductions/tdigest_tests.cu GPUS 1 PERCENT 70 diff --git a/cpp/tests/groupby/host_udf_example_tests.cu b/cpp/tests/groupby/host_udf_example_tests.cu index a454bd692fc..e1ded37d8a7 100644 --- a/cpp/tests/groupby/host_udf_example_tests.cu +++ b/cpp/tests/groupby/host_udf_example_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,9 +21,7 @@ #include #include #include -#include #include -#include #include #include @@ -34,6 +32,9 @@ #include #include +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; + namespace { /** * @brief A host-based UDF implementation for groupby. @@ -41,42 +42,21 @@ namespace { * For each group of values, the aggregation computes * `(group_idx + 1) * group_sum_of_squares - group_max * group_sum`. */ -struct host_udf_groupby_example : cudf::host_udf_base { +struct host_udf_groupby_example : cudf::groupby_host_udf { host_udf_groupby_example() = default; - [[nodiscard]] data_attribute_set_t get_required_data() const override - { - // We need grouped values, group offsets, group labels, and also results from groups' - // MAX and SUM aggregations. 
- return {groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS, - cudf::make_max_aggregation(), - cudf::make_sum_aggregation()}; - } - - [[nodiscard]] output_t get_empty_output( - [[maybe_unused]] std::optional output_dtype, - [[maybe_unused]] rmm::cuda_stream_view stream, - [[maybe_unused]] rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr get_empty_output( + rmm::cuda_stream_view, rmm::device_async_resource_ref) const override { return cudf::make_empty_column( cudf::data_type{cudf::type_to_id()}); } - [[nodiscard]] output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override { - auto const& values = - std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); - return cudf::type_dispatcher(values.type(), groupby_fn{this}, input, stream, mr); - } - - [[nodiscard]] std::size_t do_hash() const override - { - // Just return the same hash for all instances of this class. - return std::size_t{12345}; + auto const values = get_grouped_values(); + return cudf::type_dispatcher(values.type(), groupby_fn{*this}, stream, mr); } [[nodiscard]] bool is_equal(host_udf_base const& other) const override @@ -92,37 +72,33 @@ struct host_udf_groupby_example : cudf::host_udf_base { struct groupby_fn { // Store pointer to the parent class so we can call its functions. - host_udf_groupby_example const* parent; + host_udf_groupby_example const& parent; - // For simplicity, this example only accepts double input and always produces double output. + // For simplicity, this example only accepts a single type input and output. using InputType = double; using OutputType = double; template )> - output_t operator()(Args...) const + std::unique_ptr operator()(Args...) 
const { CUDF_FAIL("Unsupported input type."); } template )> - output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + std::unique_ptr operator()(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - auto const& values = - std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); - if (values.size() == 0) { return parent->get_empty_output(std::nullopt, stream, mr); } + auto const values = parent.get_grouped_values(); + if (values.size() == 0) { return parent.get_empty_output(stream, mr); } - auto const offsets = std::get>( - input.at(groupby_data_attribute::GROUP_OFFSETS)); + auto const offsets = parent.get_group_offsets(); CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets."); auto const num_groups = static_cast(offsets.size()) - 1; - auto const group_indices = std::get>( - input.at(groupby_data_attribute::GROUP_LABELS)); - auto const group_max = std::get( - input.at(cudf::make_max_aggregation())); - auto const group_sum = std::get( - input.at(cudf::make_sum_aggregation())); + auto const group_indices = parent.get_group_labels(); + auto const group_max = + parent.compute_aggregation(cudf::make_max_aggregation()); + auto const group_sum = + parent.compute_aggregation(cudf::make_sum_aggregation()); auto const values_dv_ptr = cudf::column_device_view::create(values, stream); auto const output = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, @@ -191,9 +167,6 @@ struct host_udf_groupby_example : cudf::host_udf_base { } // namespace -using doubles_col = cudf::test::fixed_width_column_wrapper; -using int32s_col = cudf::test::fixed_width_column_wrapper; - struct HostUDFGroupbyExampleTest : cudf::test::BaseFixture {}; TEST_F(HostUDFGroupbyExampleTest, SimpleInput) diff --git a/cpp/tests/groupby/host_udf_tests.cpp b/cpp/tests/groupby/host_udf_tests.cpp index 1a0f68c0c6c..17da28cdefc 100644 --- a/cpp/tests/groupby/host_udf_tests.cpp +++ b/cpp/tests/groupby/host_udf_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,178 +26,121 @@ #include namespace { + /** - * @brief A host-based UDF implementation used for unit tests. + * @brief Generate a random aggregation object from {min, max, sum, product}. */ -struct host_udf_test_base : cudf::host_udf_base { +std::unique_ptr get_random_agg() +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distr(1, 4); + switch (distr(gen)) { + case 1: return cudf::make_min_aggregation(); + case 2: return cudf::make_max_aggregation(); + case 3: return cudf::make_sum_aggregation(); + case 4: return cudf::make_product_aggregation(); + default: CUDF_UNREACHABLE("This should not be reached."); + } + return nullptr; +} + +/** + * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
+ */ +struct host_udf_groupby_test : cudf::groupby_host_udf { int test_location_line; // the location where testing is called bool* test_run; // to check if the test is accidentally skipped - data_attribute_set_t input_attrs; + bool test_other_agg; // test calling other aggregation - host_udf_test_base(int test_location_line_, bool* test_run_, data_attribute_set_t input_attrs_) - : test_location_line{test_location_line_}, - test_run{test_run_}, - input_attrs(std::move(input_attrs_)) + host_udf_groupby_test(int test_location_line_, bool* test_run_, bool test_other_agg_) + : test_location_line{test_location_line_}, test_run{test_run_}, test_other_agg{test_other_agg_} { } - [[nodiscard]] data_attribute_set_t get_required_data() const override { return input_attrs; } - - // This is the main testing function, which checks for the correctness of input data. - // The rests are just to satisfy the interface. - [[nodiscard]] output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::size_t do_hash() const override { return 0; } + [[nodiscard]] bool is_equal(host_udf_base const& other) const override { - SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); - - test_data_attributes(input, stream, mr); - - *test_run = true; // test is run successfully - return get_empty_output(std::nullopt, stream, mr); + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(test_location_line, test_run, test_other_agg); } - [[nodiscard]] output_t get_empty_output( - [[maybe_unused]] std::optional output_dtype, + [[nodiscard]] std::unique_ptr get_empty_output( [[maybe_unused]] rmm::cuda_stream_view stream, [[maybe_unused]] rmm::device_async_resource_ref mr) const override { - // Unused function - dummy output. + // Dummy output. return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); } - [[nodiscard]] std::size_t do_hash() const override { return 0; } - [[nodiscard]] bool is_equal(host_udf_base const& other) const override { return true; } + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override + { + SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); - // The main test function, which must be implemented for each kind of aggregations - // (groupby/reduction/segmented_reduction). - virtual void test_data_attributes(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; -}; + // Perform tests on types for the groupby data: we must ensure the data corresponding to each + // `groupby_data` enum having the correct type. -/** - * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
- */ -struct host_udf_groupby_test : host_udf_test_base { - host_udf_groupby_test(int test_location_line_, - bool* test_run_, - data_attribute_set_t input_attrs_ = {}) - : host_udf_test_base(test_location_line_, test_run_, std::move(input_attrs_)) - { - } + { + auto const inp_data = get_input_values(); + EXPECT_TRUE((std::is_same_v>)); + } - [[nodiscard]] std::unique_ptr clone() const override - { - return std::make_unique(test_location_line, test_run, input_attrs); - } + { + auto const inp_data = get_grouped_values(); + EXPECT_TRUE((std::is_same_v>)); + } - void test_data_attributes(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override - { - data_attribute_set_t check_attrs = input_attrs; - if (check_attrs.empty()) { - check_attrs = data_attribute_set_t{groupby_data_attribute::INPUT_VALUES, - groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::SORTED_GROUPED_VALUES, - groupby_data_attribute::NUM_GROUPS, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS}; + { + auto const inp_data = get_sorted_grouped_values(); + EXPECT_TRUE((std::is_same_v>)); } - EXPECT_EQ(input.size(), check_attrs.size()); - for (auto const& attr : check_attrs) { - EXPECT_TRUE(input.count(attr) > 0); - EXPECT_TRUE(std::holds_alternative(attr.value) || - std::holds_alternative>(attr.value)); - if (std::holds_alternative(attr.value)) { - switch (std::get(attr.value)) { - case groupby_data_attribute::INPUT_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::GROUPED_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::SORTED_GROUPED_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::NUM_GROUPS: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::GROUP_OFFSETS: - EXPECT_TRUE( - std::holds_alternative>(input.at(attr))); - break; - case groupby_data_attribute::GROUP_LABELS: - EXPECT_TRUE( - std::holds_alternative>(input.at(attr))); - break; - default:; - } - } else { // std::holds_alternative>(attr.value) - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - } + + { + auto const inp_data = get_num_groups(); + EXPECT_TRUE((std::is_same_v>)); } - } -}; -/** - * @brief Get a random subset of input data attributes. - */ -cudf::host_udf_base::data_attribute_set_t get_subset( - cudf::host_udf_base::data_attribute_set_t const& attrs) -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution size_distr(1, attrs.size() - 1); - auto const subset_size = size_distr(gen); - auto const elements = - std::vector(attrs.begin(), attrs.end()); - std::uniform_int_distribution idx_distr(0, attrs.size() - 1); - cudf::host_udf_base::data_attribute_set_t output; - while (output.size() < subset_size) { - output.insert(elements[idx_distr(gen)]); - } - return output; -} + { + auto const inp_data = get_group_offsets(); + EXPECT_TRUE((std::is_same_v, + std::decay_t>)); + } -/** - * @brief Generate a random aggregation object from {min, max, sum, product}. 
- */ -std::unique_ptr get_random_agg() -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution distr(1, 4); - switch (distr(gen)) { - case 1: return cudf::make_min_aggregation(); - case 2: return cudf::make_max_aggregation(); - case 3: return cudf::make_sum_aggregation(); - case 4: return cudf::make_product_aggregation(); - default: CUDF_UNREACHABLE("This should not be reached."); + { + auto const inp_data = get_group_labels(); + EXPECT_TRUE((std::is_same_v, + std::decay_t>)); + } + + // Perform tests on type of the result from computing other aggregations. + if (test_other_agg) { + auto const inp_data = compute_aggregation(get_random_agg()); + EXPECT_TRUE((std::is_same_v>)); + } + + *test_run = true; // test is run successfully + return get_empty_output(stream, mr); } - return nullptr; -} +}; } // namespace using int32s_col = cudf::test::fixed_width_column_wrapper; -// Number of randomly testing on the input data attributes. -// For each test, a subset of data attributes will be randomly generated from all the possible input -// data attributes. The input data corresponding to that subset passed from libcudf will be tested -// for correctness. -constexpr int NUM_RANDOM_TESTS = 20; - struct HostUDFTest : cudf::test::BaseFixture {}; -TEST_F(HostUDFTest, GroupbyAllInput) +TEST_F(HostUDFTest, GroupbyBuiltinInput) { bool test_run = false; auto const keys = int32s_col{0, 1, 2}; auto const vals = int32s_col{0, 1, 2}; auto agg = cudf::make_host_udf_aggregation( - std::make_unique(__LINE__, &test_run)); + std::make_unique(__LINE__, &test_run, /*test_other_agg*/ false)); std::vector requests; requests.emplace_back(); @@ -205,28 +148,22 @@ TEST_F(HostUDFTest, GroupbyAllInput) requests[0].aggregations.push_back(std::move(agg)); cudf::groupby::groupby gb_obj( cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); - [[maybe_unused]] auto const grp_result = - gb_obj.aggregate(requests, cudf::test::get_default_stream()); + [[maybe_unused]] auto const grp_result = gb_obj.aggregate( + requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); EXPECT_TRUE(test_run); } -TEST_F(HostUDFTest, GroupbySomeInput) +TEST_F(HostUDFTest, GroupbyWithCallingOtherAggregations) { - auto const keys = int32s_col{0, 1, 2}; - auto const vals = int32s_col{0, 1, 2}; - auto const all_attrs = cudf::host_udf_base::data_attribute_set_t{ - cudf::host_udf_base::groupby_data_attribute::INPUT_VALUES, - cudf::host_udf_base::groupby_data_attribute::GROUPED_VALUES, - cudf::host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, - cudf::host_udf_base::groupby_data_attribute::NUM_GROUPS, - cudf::host_udf_base::groupby_data_attribute::GROUP_OFFSETS, - cudf::host_udf_base::groupby_data_attribute::GROUP_LABELS}; + auto const keys = int32s_col{0, 1, 2}; + auto const vals = int32s_col{0, 1, 2}; + + constexpr int NUM_RANDOM_TESTS = 20; + for (int i = 0; i < NUM_RANDOM_TESTS; ++i) { - bool test_run = false; - auto input_attrs = get_subset(all_attrs); - input_attrs.insert(get_random_agg()); - auto agg = cudf::make_host_udf_aggregation( - std::make_unique(__LINE__, &test_run, std::move(input_attrs))); + bool test_run = false; + auto agg = cudf::make_host_udf_aggregation( + std::make_unique(__LINE__, &test_run, /*test_other_agg*/ true)); std::vector requests; requests.emplace_back(); @@ -234,8 +171,8 @@ TEST_F(HostUDFTest, GroupbySomeInput) requests[0].aggregations.push_back(std::move(agg)); cudf::groupby::groupby gb_obj( cudf::table_view({keys}), 
diff --git a/cpp/tests/reductions/host_udf_example_tests.cu b/cpp/tests/reductions/host_udf_example_tests.cu
new file mode 100644
index 00000000000..67b88c5306b
--- /dev/null
+++ b/cpp/tests/reductions/host_udf_example_tests.cu
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/aggregation/host_udf.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/transform.h>
+#include <thrust/transform_reduce.h>
+
+using doubles_col = cudf::test::fixed_width_column_wrapper<double>;
+using int32s_col  = cudf::test::fixed_width_column_wrapper<int32_t>;
+using int64s_col  = cudf::test::fixed_width_column_wrapper<int64_t>;
+
+namespace {
+/**
+ * @brief A host-based UDF implementation for reduction.
+ *
+ * The aggregation computes `sum(value^2, for value in group)` (this is a sum of squares).
+ */
+struct host_udf_reduction_example : cudf::reduce_host_udf {
+  host_udf_reduction_example() = default;
+
+  [[nodiscard]] std::unique_ptr<cudf::scalar> operator()(
+    cudf::column_view const& input,
+    cudf::data_type output_dtype,
+    std::optional<std::reference_wrapper<cudf::scalar const>> init,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) const override
+  {
+    return cudf::double_type_dispatcher(
+      input.type(), output_dtype, reduce_fn{}, input, output_dtype, init, stream, mr);
+  }
+
+  [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+  {
+    // Just check if the other object is also an instance of this class.
+    return dynamic_cast<host_udf_reduction_example const*>(&other) != nullptr;
+  }
+
+  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+  {
+    return std::make_unique<host_udf_reduction_example>();
+  }
+
+  struct reduce_fn {
+    // For simplicity, this example only accepts a single type input and output.
+    using InputType  = double;
+    using OutputType = int64_t;
+
+    template <typename T,
+              typename U,
+              typename... Args,
+              CUDF_ENABLE_IF(!std::is_same_v<T, InputType> || !std::is_same_v<U, OutputType>)>
+    std::unique_ptr<cudf::scalar> operator()(Args...) const
+    {
+      CUDF_FAIL("Unsupported input/output type.");
+    }
+
+    template <typename T,
+              typename U,
+              CUDF_ENABLE_IF(std::is_same_v<T, InputType> && std::is_same_v<U, OutputType>)>
+    [[nodiscard]] std::unique_ptr<cudf::scalar> operator()(
+      cudf::column_view const& input,
+      cudf::data_type output_dtype,
+      std::optional<std::reference_wrapper<cudf::scalar const>> init,
+      rmm::cuda_stream_view stream,
+      rmm::device_async_resource_ref mr) const
+    {
+      CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id<OutputType>()},
+                   "Invalid output type.");
+      if (input.size() == 0) {
+        return cudf::make_default_constructed_scalar(output_dtype, stream, mr);
+      }
+
+      auto const init_value = [&]() -> InputType {
+        if (init.has_value() && init.value().get().is_valid(stream)) {
+          auto const numeric_init_scalar =
+            dynamic_cast<cudf::numeric_scalar<InputType> const*>(&init.value().get());
+          CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction.");
+          return numeric_init_scalar->value(stream);
+        }
+        return InputType{0};
+      }();
+
+      auto const input_dv_ptr = cudf::column_device_view::create(input, stream);
+      auto const result = thrust::transform_reduce(rmm::exec_policy(stream),
+                                                   thrust::make_counting_iterator(0),
+                                                   thrust::make_counting_iterator(input.size()),
+                                                   transform_fn{*input_dv_ptr},
+                                                   static_cast<OutputType>(init_value),
+                                                   thrust::plus<>{});
+
+      auto output = cudf::make_numeric_scalar(output_dtype, stream, mr);
+      static_cast<cudf::scalar_type_t<OutputType>*>(output.get())->set_value(result, stream);
+      return output;
+    }
+
+    struct transform_fn {
+      cudf::column_device_view values;
+      OutputType __device__ operator()(cudf::size_type idx) const
+      {
+        if (values.is_null(idx)) { return OutputType{0}; }
+        auto const val = static_cast<OutputType>(values.element<InputType>(idx));
+        return val * val;
+      }
+    };
+  };
+};
+
+}  // namespace
+
+struct HostUDFReductionExampleTest : cudf::test::BaseFixture {};
+
+TEST_F(HostUDFReductionExampleTest, SimpleInput)
+{
+  auto const vals = doubles_col{0.0, 1.0, 2.0, 3.0, 4.0, 5.0};
+  auto const agg  = cudf::make_host_udf_aggregation<cudf::reduce_aggregation>(
+    std::make_unique<host_udf_reduction_example>());
+  auto const reduced = cudf::reduce(vals,
+                                    *agg,
+                                    cudf::data_type{cudf::type_id::INT64},
+                                    cudf::get_default_stream(),
+                                    cudf::get_current_device_resource_ref());
+  EXPECT_TRUE(reduced->is_valid());
+  EXPECT_EQ(cudf::type_id::INT64, reduced->type().id());
+  auto const result =
+    static_cast<cudf::numeric_scalar<int64_t>*>(reduced.get())->value(cudf::get_default_stream());
+  auto constexpr expected = 55;  // 0^2 + 1^2 + 2^2 + 3^2 + 4^2 + 5^2 = 55
+  EXPECT_EQ(expected, result);
+}
+
+TEST_F(HostUDFReductionExampleTest, EmptyInput)
+{
+  auto const vals = doubles_col{};
+  auto const agg  = cudf::make_host_udf_aggregation<cudf::reduce_aggregation>(
+    std::make_unique<host_udf_reduction_example>());
+  auto const reduced = cudf::reduce(vals,
+                                    *agg,
+                                    cudf::data_type{cudf::type_id::INT64},
+                                    cudf::get_default_stream(),
+                                    cudf::get_current_device_resource_ref());
+  EXPECT_FALSE(reduced->is_valid());
+  EXPECT_EQ(cudf::type_id::INT64, reduced->type().id());
+}
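+
+// Editor's note (illustrative): for a non-empty input, the reduction above
+// returns init + sum(v^2), with each element cast to int64_t before squaring;
+// SimpleInput therefore expects 0 + 1 + 4 + 9 + 16 + 25 = 55. An init scalar
+// that is null falls back to InputType{0}, the same as passing no init at all.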
+
+namespace {
+
+/**
+ * @brief A host-based UDF implementation for segmented reduction.
+ *
+ * The aggregation computes `sum(value^2, for value in group)` (this is a sum of squares).
+ */
+struct host_udf_segmented_reduction_example : cudf::segmented_reduce_host_udf {
+  host_udf_segmented_reduction_example() = default;
+
+  [[nodiscard]] std::unique_ptr<cudf::column> operator()(
+    cudf::column_view const& input,
+    cudf::device_span<cudf::size_type const> offsets,
+    cudf::data_type output_dtype,
+    cudf::null_policy null_handling,
+    std::optional<std::reference_wrapper<cudf::scalar const>> init,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) const override
+  {
+    return cudf::double_type_dispatcher(input.type(),
+                                        output_dtype,
+                                        segmented_reduce_fn{},
+                                        input,
+                                        offsets,
+                                        output_dtype,
+                                        null_handling,
+                                        init,
+                                        stream,
+                                        mr);
+  }
+
+  [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+  {
+    // Just check if the other object is also an instance of this class.
+    return dynamic_cast<host_udf_segmented_reduction_example const*>(&other) != nullptr;
+  }
+
+  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+  {
+    return std::make_unique<host_udf_segmented_reduction_example>();
+  }
+
+  struct segmented_reduce_fn {
+    // For simplicity, this example only accepts a single type input and output.
+    using InputType  = double;
+    using OutputType = int64_t;
+
+    template <typename T,
+              typename U,
+              typename... Args,
+              CUDF_ENABLE_IF(!std::is_same_v<T, InputType> || !std::is_same_v<U, OutputType>)>
+    std::unique_ptr<cudf::column> operator()(Args...) const
+    {
+      CUDF_FAIL("Unsupported input/output type.");
+    }
+
+    template <typename T,
+              typename U,
+              CUDF_ENABLE_IF(std::is_same_v<T, InputType> && std::is_same_v<U, OutputType>)>
+    std::unique_ptr<cudf::column> operator()(
+      cudf::column_view const& input,
+      cudf::device_span<cudf::size_type const> offsets,
+      cudf::data_type output_dtype,
+      cudf::null_policy null_handling,
+      std::optional<std::reference_wrapper<cudf::scalar const>> init,
+      rmm::cuda_stream_view stream,
+      rmm::device_async_resource_ref mr) const
+    {
+      CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id<OutputType>()},
+                   "Invalid output type.");
+      CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets.");
+      auto const num_segments = static_cast<cudf::size_type>(offsets.size()) - 1;
+
+      if (input.size() == 0) {
+        if (num_segments <= 0) { return cudf::make_empty_column(output_dtype); }
+        return cudf::make_numeric_column(
+          output_dtype, num_segments, cudf::mask_state::ALL_NULL, stream, mr);
+      }
+
+      auto const init_value = [&]() -> InputType {
+        if (init.has_value() && init.value().get().is_valid(stream)) {
+          auto const numeric_init_scalar =
+            dynamic_cast<cudf::numeric_scalar<InputType> const*>(&init.value().get());
+          CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction.");
+          return numeric_init_scalar->value(stream);
+        }
+        return InputType{0};
+      }();
+
+      auto const input_dv_ptr = cudf::column_device_view::create(input, stream);
+      auto output = cudf::make_numeric_column(
+        output_dtype, num_segments, cudf::mask_state::UNALLOCATED, stream);
+
+      // Store row index if it is valid, otherwise store a negative value denoting a null row.
+      rmm::device_uvector<cudf::size_type> valid_idx(num_segments, stream);
+
+      thrust::transform(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_segments),
+        thrust::make_zip_iterator(output->mutable_view().begin<OutputType>(), valid_idx.begin()),
+        transform_fn{*input_dv_ptr, offsets, static_cast<OutputType>(init_value), null_handling});
+
+      auto const valid_idx_cv = cudf::column_view{
+        cudf::data_type{cudf::type_id::INT32}, num_segments, valid_idx.begin(), nullptr, 0};
+      return std::move(cudf::gather(cudf::table_view{{output->view()}},
+                                    valid_idx_cv,
+                                    cudf::out_of_bounds_policy::NULLIFY,
+                                    stream,
+                                    mr)
+                         ->release()
+                         .front());
+    }
+
+    struct transform_fn {
+      cudf::column_device_view values;
+      cudf::device_span<cudf::size_type const> offsets;
+      OutputType init_value;
+      cudf::null_policy null_handling;
+
+      thrust::tuple<OutputType, cudf::size_type> __device__ operator()(cudf::size_type idx) const
+      {
+        auto const start = offsets[idx];
+        auto const end   = offsets[idx + 1];
+
+        auto constexpr invalid_idx = cuda::std::numeric_limits<cudf::size_type>::lowest();
+        if (start == end) { return {OutputType{0}, invalid_idx}; }
+
+        auto sum = init_value;
+        for (auto i = start; i < end; ++i) {
+          if (values.is_null(i)) {
+            if (null_handling == cudf::null_policy::INCLUDE) { sum += init_value * init_value; }
+            continue;
+          }
+          auto const val = static_cast<OutputType>(values.element<InputType>(i));
+          sum += val * val;
+        }
+        auto const segment_size = end - start;
+        return {static_cast<OutputType>(segment_size) * sum, idx};
+      }
+    };
+  };
+};
+
+}  // namespace
+
+struct HostUDFSegmentedReductionExampleTest : cudf::test::BaseFixture {};
+
+TEST_F(HostUDFSegmentedReductionExampleTest, SimpleInput)
+{
+  double constexpr null = 0.0;
+  auto const vals    = doubles_col{{0.0, null, 2.0, 3.0, null, 5.0, null, null, 8.0, 9.0},
+                                   {true, false, true, true, false, true, false, false, true, true}};
+  auto const offsets = int32s_col{0, 3, 5, 10}.release();
+  auto const agg     = cudf::make_host_udf_aggregation<cudf::segmented_reduce_aggregation>(
+    std::make_unique<host_udf_segmented_reduction_example>());
+
+  // Test without init value.
+  {
+    auto const result = cudf::segmented_reduce(
+      vals,
+      cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+      *agg,
+      cudf::data_type{cudf::type_id::INT64},
+      cudf::null_policy::INCLUDE,
+      std::nullopt,  // init value
+      cudf::get_default_stream(),
+      cudf::get_current_device_resource_ref());
+
+    // When null_policy is set to `INCLUDE`, the null values are replaced with the init value.
+    // Since the init value is not given, it is set to 0.
+    // [ 3 * (0^2 + init^2 + 2^2), 2 * (3^2 + init^2), 5 * (5^2 + init^2 + init^2 + 8^2 + 9^2) ]
+    auto const expected = int64s_col{{12, 18, 850}, {true, true, true}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
+  // Test with init value, and include nulls.
+  {
+    auto const init_scalar = cudf::make_fixed_width_scalar(3.0);
+    auto const result      = cudf::segmented_reduce(
+      vals,
+      cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+      *agg,
+      cudf::data_type{cudf::type_id::INT64},
+      cudf::null_policy::INCLUDE,
+      *init_scalar,
+      cudf::get_default_stream(),
+      cudf::get_current_device_resource_ref());
+
+    // When null_policy is set to `INCLUDE`, the null values are replaced with the init value.
+    // [ 3 * (3 + 0^2 + 3^2 + 2^2), 2 * (3 + 3^2 + 3^2), 5 * (3 + 5^2 + 3^2 + 3^2 + 8^2 + 9^2) ]
+    auto const expected = int64s_col{{48, 42, 955}, {true, true, true}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+
+  // Test with init value, and exclude nulls.
+  {
+    auto const init_scalar = cudf::make_fixed_width_scalar(3.0);
+    auto const result      = cudf::segmented_reduce(
+      vals,
+      cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+      *agg,
+      cudf::data_type{cudf::type_id::INT64},
+      cudf::null_policy::EXCLUDE,
+      *init_scalar,
+      cudf::get_default_stream(),
+      cudf::get_current_device_resource_ref());
+
+    // [ 3 * (3 + 0^2 + 2^2), 2 * (3 + 3^2), 5 * (3 + 5^2 + 8^2 + 9^2) ]
+    auto const expected = int64s_col{{21, 24, 865}, {true, true, true}};
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(HostUDFSegmentedReductionExampleTest, EmptySegments)
+{
+  auto const vals    = doubles_col{};
+  auto const offsets = int32s_col{0, 0, 0, 0}.release();
+  auto const agg     = cudf::make_host_udf_aggregation<cudf::segmented_reduce_aggregation>(
+    std::make_unique<host_udf_segmented_reduction_example>());
+  auto const result = cudf::segmented_reduce(
+    vals,
+    cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+    *agg,
+    cudf::data_type{cudf::type_id::INT64},
+    cudf::null_policy::INCLUDE,
+    std::nullopt,  // init value
+    cudf::get_default_stream(),
+    cudf::get_current_device_resource_ref());
+  auto const expected = int64s_col{{0, 0, 0}, {false, false, false}};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(HostUDFSegmentedReductionExampleTest, EmptyInput)
+{
+  auto const vals    = doubles_col{};
+  auto const offsets = int32s_col{}.release();
+  auto const agg     = cudf::make_host_udf_aggregation<cudf::segmented_reduce_aggregation>(
+    std::make_unique<host_udf_segmented_reduction_example>());
+  auto const result = cudf::segmented_reduce(
+    vals,
+    cudf::device_span<int const>(offsets->view().begin<int>(), offsets->size()),
+    *agg,
+    cudf::data_type{cudf::type_id::INT64},
+    cudf::null_policy::INCLUDE,
+    std::nullopt,  // init value
+    cudf::get_default_stream(),
+    cudf::get_current_device_resource_ref());
+  auto const expected = int64s_col{};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
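(Editor's sketch, illustrative only. The per-segment result above is `segment_size * (init + sum(v^2))`: a null element contributes `init^2` under `null_policy::INCLUDE` and nothing under `EXCLUDE`, and an empty segment yields a null row. The arithmetic can be mirrored on the host in plain standard C++ with no cudf types involved; none of the names below appear in the patch.)

    #include <cstdint>
    #include <optional>
    #include <vector>

    // Host-side mirror of host_udf_segmented_reduction_example's per-segment math.
    int64_t segment_result(std::vector<std::optional<double>> const& seg,
                           double init,
                           bool include_nulls)
    {
      auto const init64 = static_cast<int64_t>(init);
      auto sum          = init64;
      for (auto const& v : seg) {
        if (!v.has_value()) {  // null element
          if (include_nulls) { sum += init64 * init64; }
          continue;
        }
        auto const x = static_cast<int64_t>(*v);
        sum += x * x;
      }
      return static_cast<int64_t>(seg.size()) * sum;
    }

    // With init = 3 and nulls included, the segments from SimpleInput give
    // 3 * (3 + 0 + 9 + 4) = 48, 2 * (3 + 9 + 9) = 42, and
    // 5 * (3 + 25 + 9 + 9 + 64 + 81) = 955, matching the expected column.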
diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java
index 2276b223740..c07a58ed8a5 100644
--- a/java/src/main/java/ai/rapids/cudf/Aggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
index 27966ddfdd4..234a9ec1ced 100644
--- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
index ba8ae379bae..4f047a68f06 100644
--- a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -161,14 +161,14 @@ public static ReductionAggregation median() {
   /**
    * Aggregate to compute the specified quantiles. Uses linear interpolation by default.
    */
-  public static ReductionAggregation quantile(double ... quantiles) {
+  public static ReductionAggregation quantile(double... quantiles) {
     return new ReductionAggregation(Aggregation.quantile(quantiles));
   }
 
   /**
    * Aggregate to compute various quantiles.
    */
-  public static ReductionAggregation quantile(QuantileMethod method, double ... quantiles) {
+  public static ReductionAggregation quantile(QuantileMethod method, double... quantiles) {
     return new ReductionAggregation(Aggregation.quantile(method, quantiles));
   }
 
@@ -256,7 +256,7 @@ public static ReductionAggregation collectSet() {
    * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal.
    */
   public static ReductionAggregation collectSet(NullPolicy nullPolicy,
-      NullEquality nullEquality, NaNEquality nanEquality) {
+                                                NullEquality nullEquality, NaNEquality nanEquality) {
     return new ReductionAggregation(Aggregation.collectSet(nullPolicy, nullEquality, nanEquality));
   }
 
@@ -286,6 +286,15 @@ public static ReductionAggregation mergeSets(NullEquality nullEquality, NaNEqual
     return new ReductionAggregation(Aggregation.mergeSets(nullEquality, nanEquality));
   }
 
+  /**
+   * Execute a reduction using a host-side user-defined function (UDF).
+   * @param wrapper The wrapper for the native host UDF instance.
+   * @return A new ReductionAggregation instance
+   */
+  public static ReductionAggregation hostUDF(HostUDFWrapper wrapper) {
+    return new ReductionAggregation(Aggregation.hostUDF(wrapper));
+  }
+
   /**
    * Create HistogramAggregation, computing the frequencies for each unique row.
    *
diff --git a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java
index 7ed150a2fec..18e7d874886 100644
--- a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -101,4 +101,13 @@ public static SegmentedReductionAggregation any() {
   public static SegmentedReductionAggregation all() {
     return new SegmentedReductionAggregation(Aggregation.all());
   }
+
+  /**
+   * Execute a segmented reduction using a host-side user-defined function (UDF).
+   * @param wrapper The wrapper for the native host UDF instance.
+   * @return A new SegmentedReductionAggregation instance
+   */
+  public static SegmentedReductionAggregation hostUDF(HostUDFWrapper wrapper) {
+    return new SegmentedReductionAggregation(Aggregation.hostUDF(wrapper));
+  }
 }