From dc99d2f9bc602e40c7bae894b6759e30a8efdddd Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 Jan 2025 13:00:30 +0000 Subject: [PATCH 1/9] Introduce some simple benchmarks for rolling window aggregations (#17613) Previously we did not have any benchmarks for rolling aggregations. Introduce some, so we can measure the effects of any performance improvements we might make. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - MithunR (https://github.com/mythrocks) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17613 --- cpp/benchmarks/CMakeLists.txt | 7 +- .../rolling/grouped_rolling_sum.cpp | 70 +++++++++ cpp/benchmarks/rolling/rolling_sum.cpp | 134 ++++++++++++++++++ 3 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 cpp/benchmarks/rolling/grouped_rolling_sum.cpp create mode 100644 cpp/benchmarks/rolling/rolling_sum.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 749e1b628ee..0ff712c1c77 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -425,6 +425,11 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp) # --------------------------------------------------------------------------------- ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp) +# ################################################################################################## +# * rolling benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp) + add_custom_target( run_benchmarks DEPENDS CUDF_BENCHMARKS diff --git a/cpp/benchmarks/rolling/grouped_rolling_sum.cpp b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp new file mode 100644 index 00000000000..04afe5ac661 --- /dev/null +++ b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include + +#include + +template +void bench_row_grouped_rolling_sum(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const cardinality = static_cast(state.get_int64("cardinality")); + auto const preceding_size = static_cast(state.get_int64("preceding_size")); + auto const following_size = static_cast(state.get_int64("following_size")); + auto const min_periods = static_cast(state.get_int64("min_periods")); + + auto const keys = [&] { + data_profile const profile = + data_profile_builder() + .cardinality(cardinality) + .no_validity() + .distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + auto keys = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + return cudf::sort(cudf::table_view{{keys->view()}}); + }(); + data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + auto req = cudf::make_sum_aggregation(); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = cudf::grouped_rolling_window( + keys->view(), vals->view(), preceding_size, following_size, min_periods, *req); + }); + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_grouped_rolling_sum") + .add_int64_power_of_two_axis("num_rows", {14, 28}) + .add_int64_axis("preceding_size", {1, 10}) + .add_int64_axis("following_size", {2}) + .add_int64_axis("min_periods", {1}) + .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000}); diff --git a/cpp/benchmarks/rolling/rolling_sum.cpp b/cpp/benchmarks/rolling/rolling_sum.cpp new file mode 100644 index 00000000000..af9ecd6a26f --- /dev/null +++ b/cpp/benchmarks/rolling/rolling_sum.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +template +void bench_row_fixed_rolling_sum(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const preceding_size = static_cast(state.get_int64("preceding_size")); + auto const following_size = static_cast(state.get_int64("following_size")); + auto const min_periods = static_cast(state.get_int64("min_periods")); + + data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + auto vals = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + auto req = cudf::make_sum_aggregation(); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = + cudf::rolling_window(vals->view(), preceding_size, following_size, min_periods, *req); + }); + auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +template +void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const preceding_size = static_cast(state.get_int64("preceding_size")); + auto const following_size = static_cast(state.get_int64("following_size")); + + auto vals = [&]() { + data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, 100); + return create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + }(); + + auto preceding = [&]() { + auto data = std::vector(num_rows); + auto it = thrust::make_counting_iterator(0); + std::transform(it, it + num_rows, data.begin(), [num_rows, preceding_size](auto i) { + return std::min(i + 1, std::max(preceding_size, i + 1 - num_rows)); + }); + auto buf = rmm::device_buffer( + data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream()); + cudf::get_default_stream().synchronize(); + return std::make_unique(cudf::data_type(cudf::type_to_id()), + num_rows, + std::move(buf), + rmm::device_buffer{}, + 0); + }(); + + auto following = [&]() { + auto data = std::vector(num_rows); + auto it = thrust::make_counting_iterator(0); + std::transform(it, it + num_rows, data.begin(), [num_rows, following_size](auto i) { + return std::max(-i - 1, std::min(following_size, num_rows - i - 1)); + }); + auto buf = rmm::device_buffer( + data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream()); + cudf::get_default_stream().synchronize(); + return std::make_unique(cudf::data_type(cudf::type_to_id()), + num_rows, + std::move(buf), + rmm::device_buffer{}, + 0); + }(); + + auto req = cudf::make_sum_aggregation(); + + auto const mem_stats_logger = cudf::memory_stats_logger(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto const result = + cudf::rolling_window(vals->view(), preceding->view(), following->view(), 1, *req); + }); + auto 
const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + state.add_element_count(static_cast(num_rows) / elapsed_time / 1'000'000., "Mrows/s"); + state.add_buffer_size( + mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage"); +} + +NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_fixed_rolling_sum") + .add_int64_power_of_two_axis("num_rows", {14, 22, 28}) + .add_int64_axis("preceding_size", {1, 10, 100}) + .add_int64_axis("following_size", {2}) + .add_int64_axis("min_periods", {1, 20}); + +NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("row_variable_rolling_sum") + .add_int64_power_of_two_axis("num_rows", {14, 22, 28}) + .add_int64_axis("preceding_size", {10, 100}) + .add_int64_axis("following_size", {2}); From d05b78b13e3cce55e1691de86b0c6020d4f1b0cd Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:54:22 -0500 Subject: [PATCH 2/9] Fix build metrics report format with long placehold filenames (#17679) Truncates filenames that appear as multiple `placehold_placedhold_...` in the Build Metrics Report. Example show here: https://downloads.rapids.ai/ci/cudf/pull-request/17669/0710ad6/cuda12_x86_64.ninja_log.html (requires VPN). Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/17679 --- cpp/scripts/sort_ninja_log.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py index 42f84e4d0c7..e111367d191 100755 --- a/cpp/scripts/sort_ninja_log.py +++ b/cpp/scripts/sort_ninja_log.py @@ -1,8 +1,9 @@ # -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # import argparse import os +import re import sys import xml.etree.ElementTree as ET from pathlib import Path @@ -144,6 +145,16 @@ def format_file_size(input_size): return file_size_str +def replace_placeholder_patterns(input_string: str) -> str: + pattern = r'(_h_env_placehold)[_placehold]+' + return re.sub(pattern, r'\1...', input_string) + + +# adjust name for display +def format_file_name(name: str) -> str: + return replace_placeholder_patterns(name) + + # Output chart results in HTML format # Builds a standalone html file with no javascript or styles def output_html(entries, sorted_list, cmp_entries, args): @@ -223,7 +234,8 @@ def output_html(entries, sorted_list, cmp_entries, args): print("", end="") # use a slightly smaller, fixed-width font @@ -265,7 +277,8 @@ def output_html(entries, sorted_list, cmp_entries, args): file_size_str = format_file_size(file_size) # output entry row - print("", name, "", sep="", end="") + display_name = format_file_name(name) + print("", display_name, "", sep="", end="") print("", build_time_str, "", sep="", end="") print("", file_size_str, "", sep="", end="") # output diff column From a38ce0a2447e9bca15f3a904c54fe1eba27e5940 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Wed, 8 Jan 2025 08:55:09 -0500 Subject: [PATCH 3/9] Remove pragma GCC diagnostic from source files (#17637) Removes the `#pragma GCC diagnostic` from several source files. These do not seem to be necessary and could suppress useful compile warnings. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Nghia Truong (https://github.com/ttnghia) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17637 --- cpp/src/join/mixed_join_kernel.cuh | 4 +--- cpp/src/join/mixed_join_kernels_semi.cu | 4 +--- cpp/src/join/mixed_join_size_kernel.cuh | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh index 368b1fba870..4565626edad 100644 --- a/cpp/src/join/mixed_join_kernel.cuh +++ b/cpp/src/join/mixed_join_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,8 +37,6 @@ namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) mixed_join(table_device_view left_table, diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu index a4ec97af235..4c063b6202e 100644 --- a/cpp/src/join/mixed_join_kernels_semi.cu +++ b/cpp/src/join/mixed_join_kernels_semi.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,8 +30,6 @@ namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) mixed_join_semi(table_device_view left_table, diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh index 98170ed719a..869d05ce4d3 100644 --- a/cpp/src/join/mixed_join_size_kernel.cuh +++ b/cpp/src/join/mixed_join_size_kernel.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,8 +34,6 @@ namespace cudf { namespace detail { namespace cg = cooperative_groups; -#pragma GCC diagnostic ignored "-Wattributes" - template CUDF_KERNEL void __launch_bounds__(block_size) compute_mixed_join_output_size(table_device_view left_table, From da316860211281807e39fadb2a543bcdd6f56abb Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 Jan 2025 16:25:14 +0000 Subject: [PATCH 4/9] Skip polars test that can generate timezones that chrono_tz doesn't know (#17694) On Ubuntu 20.04, the tzdata package contains a bunch of symlinks for obsolete timezone names. However, the chrono_tz package that polars uses doesn't read /usr/share/zoneinfo, instead packaging the current zoneinfo database from IANA. Consequently, when this hypothesis-generated test runs and generates timezones from the available zoneinfo-reported timezones, we can get an error from polars that the requested timezone is unknown. Since this is random, just skip it, rather than xfailing. 
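To make the mismatch concrete, a small sketch (illustrative only, not code from this PR); `bundled_zones` is a hypothetical stand-in for the zone list that chrono_tz bundles from IANA:

    import zoneinfo

    # Names the system tzdata exposes; on Ubuntu 20.04 this includes legacy
    # aliases such as "US/Eastern" that exist only as symlinks. The
    # hypothesis-generated test samples its timezones from this set.
    system_zones = zoneinfo.available_timezones()

    # Hypothetical stand-in for the zone list chrono_tz ships.
    bundled_zones = {"America/New_York", "Europe/London", "UTC"}

    # Anything left here can be generated by the test yet rejected by polars
    # as an unknown timezone, which is why the test is skipped rather than
    # xfailed.
    unknown_to_polars = sorted(system_zones - bundled_zones)
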
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Matthew Murray (https://github.com/Matt711) - Peter Andreas Entschev (https://github.com/pentschev) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17694 --- .../cudf_polars/cudf_polars/testing/plugin.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 7a759eea2e9..87628242838 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Plugin for running polars test suite setting GPU engine as default.""" @@ -174,6 +174,19 @@ def pytest_configure(config: pytest.Config) -> None: } +TESTS_TO_SKIP: Mapping[str, str] = { + # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks + # for obsolete timezone names. However, the chrono_tz package that + # polars uses doesn't read /usr/share/zoneinfo, instead packaging + # the current zoneinfo database from IANA. Consequently, when this + # hypothesis-generated test runs and generates timezones from the + # available zoneinfo-reported timezones, we can get an error from + # polars that the requested timezone is unknown. + # Since this is random, just skip it, rather than xfailing. + "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names", +} + + def pytest_collection_modifyitems( session: pytest.Session, config: pytest.Config, items: list[pytest.Item] ) -> None: @@ -182,5 +195,7 @@ def pytest_collection_modifyitems( # Don't xfail tests if running without fallback return for item in items: - if item.nodeid in EXPECTED_FAILURES: + if item.nodeid in TESTS_TO_SKIP: + item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid])) + elif item.nodeid in EXPECTED_FAILURES: item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) From f017f869829cb05694d195aab0f118357c0dbbd8 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 8 Jan 2025 09:30:40 -0800 Subject: [PATCH 5/9] Control pinned memory use with environment variables (#17657) Adds `LIBCUDF_KERNEL_PINNED_COPY_THRESHOLD` and `LIBCUDF_ALLOCATE_HOST_AS_PINNED_THRESHOLD` environment variables to set the pinned memory optimizations' thresholds. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17657 --- cpp/src/utilities/host_memory.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 73c4567d3a4..94d27d976c3 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "io/utilities/getenv_or.hpp" + #include #include #include @@ -277,7 +279,7 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts) CUDF_EXPORT auto& kernel_pinned_copy_threshold() { // use cudaMemcpyAsync for all pinned copies - static std::atomic threshold = 0; + static std::atomic threshold = getenv_or("LIBCUDF_KERNEL_PINNED_COPY_THRESHOLD", 0); return threshold; } @@ -291,7 +293,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 0; + static std::atomic threshold = getenv_or("LIBCUDF_ALLOCATE_HOST_AS_PINNED_THRESHOLD", 0); return threshold; } From f1cb88df8eb7862a82969dfdfd746886198a9b22 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 8 Jan 2025 09:41:20 -0800 Subject: [PATCH 6/9] Define cudf repr methods on the Column (#17675) Refactors cudf Python objects' repr handling to define the core conversion of "cleaning" nulls at the column level and then rolling up the conversions at the `Frame` and its subclasses level. Notable positive changes: * `repr(cudf.Series)` no longer deep copies * Fixes a bug when `repr(cudf.Series)` with a timedelta type to better match pandas (adjusted unit tests accordingly) Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17675 --- python/cudf/cudf/core/_base_index.py | 4 +- python/cudf/cudf/core/column/column.py | 16 +++++- python/cudf/cudf/core/column/datetime.py | 4 +- python/cudf/cudf/core/column/lists.py | 13 ++++- python/cudf/cudf/core/column/struct.py | 13 ++++- python/cudf/cudf/core/column/timedelta.py | 2 + python/cudf/cudf/core/dataframe.py | 53 +++---------------- python/cudf/cudf/core/frame.py | 9 +++- python/cudf/cudf/core/index.py | 30 ++--------- python/cudf/cudf/core/indexed_frame.py | 6 +++ python/cudf/cudf/core/multiindex.py | 19 +++---- python/cudf/cudf/core/series.py | 29 ++--------- python/cudf/cudf/tests/test_repr.py | 62 +++++++++++------------ 13 files changed, 117 insertions(+), 143 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index c2f3c782d10..2806a1f6c23 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -350,7 +350,7 @@ def names(self, values): self.name = values[0] - def _clean_nulls_from_index(self): + def _pandas_repr_compatible(self): """ Convert all na values(if any) in Index object to `` as a preprocessing step to `__repr__` methods. diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 24b657f1c32..ef815e44d9d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -77,6 +77,7 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.strings import StringColumn if PANDAS_GE_210: NumpyExtensionArray = pd.arrays.NumpyExtensionArray @@ -92,6 +93,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } + _PANDAS_NA_REPR = str(pd.NA) + def data_array_view( self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": @@ -176,6 +179,17 @@ def __repr__(self): f"dtype: {self.dtype}" ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + if self.has_nulls(): + return self.astype("str").fillna(self._PANDAS_NA_REPR) + return self + def to_pandas( self, *, diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b6a4122ebb9..80551e33115 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -212,6 +212,8 @@ class DatetimeColumn(column.ColumnBase): "__rsub__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6283e498842..9c5041df521 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -28,6 +28,7 @@ from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class ListColumn(ColumnBase): @@ -67,6 +68,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + # TODO: handle if self.has_nulls(): case + return self + @cached_property def memory_usage(self): n = super().memory_usage diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index ba765b50729..052a68cec98 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from functools import cached_property @@ -18,6 +18,7 @@ from cudf._typing import Dtype from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class StructColumn(ColumnBase): @@ -51,6 +52,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. 
+ + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + # TODO: handle if self.has_nulls(): case + return self + @staticmethod def _validate_dtype_instance(dtype: StructDtype) -> StructDtype: # IntervalDtype is a subclass of StructDtype, so compare types exactly diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 749ab8e837a..302178ea277 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -81,6 +81,8 @@ class TimeDeltaColumn(ColumnBase): "__rfloordiv__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b2121511a14..40d36a6ff56 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -1894,7 +1894,7 @@ def astype( dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) - def _clean_renderable_dataframe(self, output): + def _clean_renderable_dataframe(self, output: Self) -> str: """ This method takes in partial/preprocessed dataframe and returns correct representation of it with correct @@ -1929,41 +1929,7 @@ def _clean_renderable_dataframe(self, output): ) return "\n".join(lines) - def _clean_nulls_from_dataframe(self, df): - """ - This function converts all ``null`` values to ```` for - representation as a string in `__repr__`. - - Since we utilize Pandas `__repr__` at all places in our code - for formatting purposes, we convert columns to `str` dtype for - filling with `` values. - """ - for col in df._data: - if isinstance( - df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype) - ): - # TODO we need to handle this - pass - elif df._data[col].has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance( - df._data[col], - ( - cudf.core.column.DatetimeColumn, - cudf.core.column.TimeDeltaColumn, - ), - ) - else str(cudf.NA) - ) - - df[col] = df._data[col].astype("str").fillna(fill_value) - else: - df[col] = df._data[col] - - return df - - def _get_renderable_dataframe(self): + def _get_renderable_dataframe(self) -> Self: """ Takes rows and columns from pandas settings or estimation from size. pulls quadrants based off of some known parameters then style for @@ -1971,9 +1937,9 @@ def _get_renderable_dataframe(self): for printing with the dataframe. 
""" max_rows = pd.options.display.max_rows - nrows = np.max([len(self) if max_rows is None else max_rows, 1]) - if pd.options.display.max_rows == 0: - nrows = len(self) + if max_rows in {0, None}: + max_rows = len(self) + nrows = max(max_rows, 1) ncols = ( pd.options.display.max_columns if pd.options.display.max_columns @@ -1981,7 +1947,7 @@ def _get_renderable_dataframe(self): ) if len(self) <= nrows and self._num_columns <= ncols: - output = self.copy(deep=False) + output = self elif self.empty and len(self.index) > 0: max_seq_items = pd.options.display.max_seq_items # In case of Empty DataFrame with index, Pandas prints @@ -2041,10 +2007,7 @@ def _get_renderable_dataframe(self): lower = cudf.concat([lower_left, lower_right], axis=1) output = cudf.concat([upper, lower]) - output = self._clean_nulls_from_dataframe(output) - output.index = output.index._clean_nulls_from_index() - - return output + return output._pandas_repr_compatible() @_performance_tracking def __repr__(self): diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8f45c6f0115..abf9f7b3686 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -820,6 +820,13 @@ def fillna( inplace=inplace, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + columns = (col._prep_pandas_compat_repr() for col in self._columns) + return self._from_data_like_self( + self._data._from_columns_like_self(columns, verify=False) + ) + @_performance_tracking def _drop_column( self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 85be8d21d27..54635b162bc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -339,7 +339,7 @@ def _values(self) -> ColumnBase: else: return column.column_empty(0, dtype=self.dtype) - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self def _is_numeric(self) -> bool: @@ -1127,15 +1127,9 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self: out.name = name return out - @classmethod @_performance_tracking - def _from_data_like_self( - cls, data: MutableMapping, name: Any = no_default - ) -> Self: - out = _index_from_data(data, name) - if name is not no_default: - out.name = name - return out + def _from_data_like_self(self, data: MutableMapping) -> Self: + return _index_from_data(data, self.name) @classmethod @_performance_tracking @@ -1494,7 +1488,7 @@ def __repr__(self) -> str: if isinstance(self._values, StringColumn): output = repr(self.to_pandas(nullable=True)) else: - output = repr(self._clean_nulls_from_index().to_pandas()) + output = repr(self._pandas_repr_compatible().to_pandas()) # We should remove all the single quotes # from the output due to the type-cast to # object dtype happening above. 
@@ -1650,20 +1644,6 @@ def __contains__(self, item) -> bool: hash(item) return item in self._column - def _clean_nulls_from_index(self) -> Index: - if self._values.has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance(self, (DatetimeIndex, TimedeltaIndex)) - else str(cudf.NA) - ) - return Index._from_column( - self._column.astype("str").fillna(fill_value), - name=self.name, - ) - - return self - def any(self) -> bool: return self._column.any() @@ -3615,7 +3595,7 @@ def _is_interval(self) -> bool: def _is_boolean(self) -> bool: return False - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self @property diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e9ed74f804b..c779e1ebe97 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4410,6 +4410,12 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): index_names=self.index.names if keep_index else None, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + result = super()._pandas_repr_compatible() + result.index = self.index._pandas_repr_compatible() + return result + def take(self, indices, axis=0): """Return a new frame containing the rows specified by *indices*. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1e613e49ffc..e7efd01ca85 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -361,6 +361,13 @@ def _from_data( name=name, ) + @_performance_tracking + def _from_data_like_self(self, data: MutableMapping) -> Self: + mi = type(self)._from_data(data, name=self.name) + if mi.nlevels == self.nlevels: + mi.names = self.names + return mi + @classmethod def _simple_new( cls, @@ -1753,16 +1760,6 @@ def nunique(self, dropna: bool = True) -> int: mi = self.dropna(how="all") if dropna else self return len(mi.unique()) - def _clean_nulls_from_index(self) -> Self: - """ - Convert all na values(if any) in MultiIndex object - to `` as a preprocessing step to `__repr__` methods. 
- """ - index_df = self.to_frame(index=False, name=list(range(self.nlevels))) - return MultiIndex.from_frame( - index_df._clean_nulls_from_dataframe(index_df), names=self.names - ) - @_performance_tracking def memory_usage(self, deep: bool = False) -> int: usage = sum(col.memory_usage for col in self._columns) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 49c2c8cf387..3b047ee5ed4 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1449,35 +1449,16 @@ def __repr__(self): warnings.simplefilter("ignore", FutureWarning) preprocess = cudf.concat([top, bottom]) else: - preprocess = self.copy() - preprocess.index = preprocess.index._clean_nulls_from_index() - if ( - preprocess.nullable - and not isinstance( - preprocess.dtype, - ( - cudf.CategoricalDtype, - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype, - ), - ) - ) or preprocess.dtype.kind == "m": - fill_value = ( - str(cudf.NaT) - if preprocess.dtype.kind in "mM" - else str(cudf.NA) - ) - output = repr( - preprocess.astype("str").fillna(fill_value).to_pandas() - ) - elif isinstance(preprocess.dtype, cudf.CategoricalDtype): + preprocess = self + if isinstance(preprocess.dtype, cudf.CategoricalDtype): min_rows = ( height if pd.get_option("display.min_rows") == 0 else pd.get_option("display.min_rows") ) show_dimensions = pd.get_option("display.show_dimensions") + preprocess = preprocess.copy(deep=False) + preprocess.index = preprocess.index._pandas_repr_compatible() if preprocess.dtype.categories.dtype.kind == "f": pd_series = ( preprocess.astype("str") @@ -1502,7 +1483,7 @@ def __repr__(self): na_rep=str(cudf.NA), ) else: - output = repr(preprocess.to_pandas()) + output = repr(preprocess._pandas_repr_compatible().to_pandas()) lines = output.split("\n") if isinstance(preprocess.dtype, cudf.CategoricalDtype): diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index bf0c97adb00..2cb742727cc 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
import textwrap @@ -618,9 +618,9 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), textwrap.dedent( """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 0 days 00:00:00.003000000 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:00.000200 + 2 0 days 00:00:00.003000 dtype: timedelta64[ns] """ ), @@ -710,12 +710,12 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.012 - 1 0 days 00:00:00.012 - 2 0 days 00:00:00.022 - 3 0 days 00:00:00.343 - 4 0 days 01:12:33.534 - 5 0 days 00:07:15.342 + 0 0 days 00:00:00.012000 + 1 0 days 00:00:00.012000 + 2 0 days 00:00:00.022000 + 3 0 days 00:00:00.343000 + 4 0 days 01:12:33.534000 + 5 0 days 00:07:15.342000 dtype: timedelta64[ms] """ ), @@ -745,13 +745,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.001 - 1 0 days 00:00:01.132 - 2 0 days 06:27:03.231 - 3 0 days 00:00:00.233 - 4 0 days 00:00:00 - 5 0 days 00:00:00.332 - 6 0 days 00:00:00.323 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:01.132000 + 2 0 days 06:27:03.231000 + 3 0 days 00:00:00.233000 + 4 0 days 00:00:00 + 5 0 days 00:00:00.332000 + 6 0 days 00:00:00.323000 dtype: timedelta64[ms] """ ), @@ -771,13 +771,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 dtype: timedelta64[ms] """ ), @@ -824,13 +824,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 Name: abc, dtype: timedelta64[ms] """ ), From 2c385c456d71fddb74298871b0918b0fb7ad72f4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 8 Jan 2025 12:58:58 -0800 Subject: [PATCH 7/9] Convert cudf.Scalar usage to pylibcudf and pyarrow usage (#17686) A lot of `cudf.Scalar` usage is to eventually end up with a device scalar object (`pylibcudf.Scalar`) to pass to a pylibcudf routine. The conversion logic to get there can be achieved by converting to a `pyarrow.Scalar` and using `pylibcudf.interop.from_arrow`. This way we offload a lot of scalar-conversion-logic in `cudf.Scalar` to `pyarrow.Scalar` which can further be converted using the interop method. This PR just tackles some straightforward cases of the above. 
Another PR will tackle the more involved cases Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17686 --- python/cudf/cudf/api/types.py | 4 +- python/cudf/cudf/core/byte_pair_encoding.py | 5 +- python/cudf/cudf/core/column/column.py | 22 ++- python/cudf/cudf/core/column/datetime.py | 4 +- python/cudf/cudf/core/column/lists.py | 14 +- python/cudf/cudf/core/column/numerical.py | 11 +- python/cudf/cudf/core/column/string.py | 190 ++++++++++--------- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/groupby/groupby.py | 6 +- python/cudf/cudf/core/index.py | 3 +- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/series.py | 4 +- python/cudf/cudf/core/tokenize_vocabulary.py | 5 +- python/cudf/cudf/core/window/rolling.py | 11 +- python/cudf/cudf/tests/test_list.py | 21 +- 15 files changed, 167 insertions(+), 139 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 9c436dfad18..cad4b1aa72c 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. """Define common type operations.""" @@ -13,6 +13,7 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa from pandas.api import types as pd_types import cudf @@ -144,6 +145,7 @@ def is_scalar(val): cudf.Scalar, cudf._lib.scalar.DeviceScalar, cudf.core.tools.datetimes.DateOffset, + pa.Scalar, ), ) or ( pd_types.is_scalar(val) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index b49f5154697..0fe47255368 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -53,7 +53,6 @@ def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: 1 this is it dtype: object """ - sep = cudf.Scalar(separator, dtype="str") return cudf.Series._from_column( - text._column.byte_pair_encoding(self.merge_pairs, sep) + text._column.byte_pair_encoding(self.merge_pairs, separator) ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index ef815e44d9d..e23ca810065 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -253,8 +253,12 @@ def find_and_replace( def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self: plc_column = plc.replace.clamp( self.to_pylibcudf(mode="read"), - cudf.Scalar(lo, self.dtype).device_value.c_value, - cudf.Scalar(hi, self.dtype).device_value.c_value, + plc.interop.from_arrow( + pa.scalar(lo, type=cudf_dtype_to_pa_type(self.dtype)) + ), + plc.interop.from_arrow( + pa.scalar(hi, type=cudf_dtype_to_pa_type(self.dtype)) + ), ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] @@ -1029,7 +1033,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: # https://github.com/rapidsai/cudf/issues/14515 by # providing a mode in which cudf::contains does not mask # the result. 
- result = result.fillna(cudf.Scalar(rhs.null_count > 0)) + result = result.fillna(rhs.null_count > 0) return result def as_mask(self) -> Buffer: @@ -1995,12 +1999,12 @@ def as_column( column = Column.from_pylibcudf( plc.filling.sequence( len(arbitrary), - cudf.Scalar( - arbitrary.start, dtype=np.dtype(np.int64) - ).device_value.c_value, - cudf.Scalar( - arbitrary.step, dtype=np.dtype(np.int64) - ).device_value.c_value, + plc.interop.from_arrow( + pa.scalar(arbitrary.start, type=pa.int64()) + ), + plc.interop.from_arrow( + pa.scalar(arbitrary.step, type=pa.int64()) + ), ) ) if cudf.get_option("default_integer_bitwidth") and dtype is None: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 80551e33115..1bde7d27700 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -353,8 +353,8 @@ def is_year_end(self) -> ColumnBase: day_of_year = self.day_of_year leap_dates = self.is_leap_year - leap = day_of_year == cudf.Scalar(366) - non_leap = day_of_year == cudf.Scalar(365) + leap = day_of_year == 366 + non_leap = day_of_year == 365 return leap.copy_if_else(non_leap, leap_dates).fillna(False) @property diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 9c5041df521..6fc2b5d4ca2 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -285,7 +285,7 @@ def as_string_column(self) -> cudf.core.column.StringColumn: with acquire_spill_lock(): plc_column = plc.strings.convert.convert_lists.format_list_column( lc.to_pylibcudf(mode="read"), - cudf.Scalar("None").device_value.c_value, + plc.interop.from_arrow(pa.scalar("None")), separators.to_pylibcudf(mode="read"), ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] @@ -391,20 +391,20 @@ def extract_element_column(self, index: ColumnBase) -> ColumnBase: ) @acquire_spill_lock() - def contains_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + def contains_scalar(self, search_key: pa.Scalar) -> ColumnBase: return type(self).from_pylibcudf( plc.lists.contains( self.to_pylibcudf(mode="read"), - search_key.device_value.c_value, + plc.interop.from_arrow(search_key), ) ) @acquire_spill_lock() - def index_of_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + def index_of_scalar(self, search_key: pa.Scalar) -> ColumnBase: return type(self).from_pylibcudf( plc.lists.index_of( self.to_pylibcudf(mode="read"), - search_key.device_value.c_value, + plc.interop.from_arrow(search_key), plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -569,7 +569,7 @@ def contains(self, search_key: ScalarLike) -> ParentType: dtype: bool """ return self._return_or_inplace( - self._column.contains_scalar(cudf.Scalar(search_key)) + self._column.contains_scalar(pa.scalar(search_key)) ) def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: @@ -618,7 +618,7 @@ def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: """ if is_scalar(search_key): - result = self._column.index_of_scalar(cudf.Scalar(search_key)) + result = self._column.index_of_scalar(pa.scalar(search_key)) else: result = self._column.index_of_column(as_column(search_key)) return self._return_or_inplace(result) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 8fe5299fcdd..70103745926 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA 
CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from numba.np import numpy_support from typing_extensions import Self @@ -382,12 +383,8 @@ def as_string_column(self) -> cudf.core.column.StringColumn: elif self.dtype.kind == "b": conv_func = functools.partial( plc.strings.convert.convert_booleans.from_booleans, - true_string=cudf.Scalar( - "True", dtype="str" - ).device_value.c_value, - false_string=cudf.Scalar( - "False", dtype="str" - ).device_value.c_value, + true_string=plc.interop.from_arrow(pa.scalar("True")), + false_string=plc.interop.from_arrow(pa.scalar("False")), ) elif self.dtype.kind in {"i", "u"}: conv_func = plc.strings.convert.convert_integers.from_integers diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fcdcb789f23..20eded9a27f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -302,8 +302,10 @@ def cat(self, others=None, sep=None, na_rep=None): with acquire_spill_lock(): plc_column = plc.strings.combine.join_strings( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(na_rep, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow( + pa.scalar(na_rep, type=pa.string()) + ), ) data = Column.from_pylibcudf(plc_column) else: @@ -359,8 +361,10 @@ def cat(self, others=None, sep=None, na_rep=None): ) ] ), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(na_rep, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow( + pa.scalar(na_rep, type=pa.string()) + ), ) data = Column.from_pylibcudf(plc_column) @@ -522,11 +526,9 @@ def join( with acquire_spill_lock(): plc_column = plc.strings.combine.join_list_elements( strings_column.to_pylibcudf(mode="read"), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(string_na_rep).device_value.c_value, - cudf._lib.scalar.DeviceScalar( - "", cudf.dtype("object") - ).c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow(pa.scalar(string_na_rep)), + plc.interop.from_arrow(pa.scalar("")), plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -547,8 +549,8 @@ def join( plc_column = plc.strings.combine.join_list_elements( strings_column.to_pylibcudf(mode="read"), sep_column.to_pylibcudf(mode="read"), - cudf.Scalar(sep_na_rep).device_value.c_value, - cudf.Scalar(string_na_rep).device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep_na_rep)), + plc.interop.from_arrow(pa.scalar(string_na_rep)), plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -800,14 +802,14 @@ def contains( else: if case is False: input_column = self.lower()._column # type: ignore[union-attr] - plc_pat = cudf.Scalar(pat.lower(), dtype="str") # type: ignore[union-attr] + pat_normed = pat.lower() # type: ignore[union-attr] else: input_column = self._column - plc_pat = cudf.Scalar(pat, dtype="str") + pat_normed = pat with acquire_spill_lock(): plc_result = plc.strings.find.contains( input_column.to_pylibcudf(mode="read"), - plc_pat.device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat_normed)), ) result_col = Column.from_pylibcudf(plc_result) else: @@ -892,8 +894,8 
@@ def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: with acquire_spill_lock(): plc_result = plc.strings.contains.like( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(pat, "str").device_value.c_value, - cudf.Scalar(esc, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat)), + plc.interop.from_arrow(pa.scalar(esc)), ) result = Column.from_pylibcudf(plc_result) @@ -1071,14 +1073,14 @@ def replace( plc.strings.regex_program.RegexProgram.create( pat, plc.strings.regex_flags.RegexFlags.DEFAULT ), - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl)), n, ) else: plc_result = plc.strings.replace.replace( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(pat).device_value.c_value, - cudf.Scalar(repl).device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat)), + plc.interop.from_arrow(pa.scalar(repl)), n, ) result = Column.from_pylibcudf(plc_result) @@ -1194,13 +1196,13 @@ def slice( 2 cm dtype: object """ - param_dtype = np.dtype(np.int32) + param_dtype = pa.int32() with acquire_spill_lock(): plc_result = plc.strings.slice.slice_strings( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(start, param_dtype).device_value.c_value, - cudf.Scalar(stop, param_dtype).device_value.c_value, - cudf.Scalar(step, param_dtype).device_value.c_value, + plc.interop.from_arrow(pa.scalar(start, param_dtype)), + plc.interop.from_arrow(pa.scalar(stop, param_dtype)), + plc.interop.from_arrow(pa.scalar(step, param_dtype)), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -2174,7 +2176,7 @@ def filter_alphanum( plc.strings.char_types.StringCharacterTypes.ALL_TYPES if keep else plc.strings.char_types.StringCharacterTypes.ALPHANUM, - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), plc.strings.char_types.StringCharacterTypes.ALPHANUM if keep else plc.strings.char_types.StringCharacterTypes.ALL_TYPES, @@ -2318,7 +2320,7 @@ def slice_replace( with acquire_spill_lock(): plc_result = plc.strings.replace.replace_slice( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), start, stop, ) @@ -2499,7 +2501,7 @@ def get_json_object( with acquire_spill_lock(): plc_result = plc.json.get_json_object( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(json_path, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(json_path)), options, ) result = Column.from_pylibcudf(plc_result) @@ -2657,7 +2659,12 @@ def split( if regex is True: data = self._column.split_re(pat, n) else: - data = self._column.split(cudf.Scalar(pat, "str"), n) + data = self._column.split( + plc.interop.from_arrow( + pa.scalar(pat, type=pa.string()) + ), + n, + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ -2667,7 +2674,7 @@ def split( result_table = self._column.split_record_re(pat, n) else: result_table = self._column.split_record( - cudf.Scalar(pat, "str"), n + plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2829,7 +2836,12 @@ def rsplit( if regex is True: data = self._column.rsplit_re(pat, n) else: - data = self._column.rsplit(cudf.Scalar(pat, "str"), n) + data = self._column.rsplit( + plc.interop.from_arrow( + pa.scalar(pat, type=pa.string()) + ), + n, + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ 
-2839,7 +2851,7 @@ def rsplit( result_table = self._column.rsplit_record_re(pat, n) else: result_table = self._column.rsplit_record( - cudf.Scalar(pat, "str"), n + plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2924,7 +2936,9 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - self._column.partition(cudf.Scalar(sep, "str")), + self._column.partition( + plc.interop.from_arrow(pa.scalar(sep, type=pa.string())) + ), expand=expand, ) @@ -2989,7 +3003,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - self._column.rpartition(cudf.Scalar(sep, "str")), + self._column.rpartition( + plc.interop.from_arrow(pa.scalar(sep, type=pa.string())) + ), expand=expand, ) @@ -3303,7 +3319,7 @@ def _strip( plc_result = plc.strings.strip.strip( self._column.to_pylibcudf(mode="read"), side, - cudf.Scalar(to_strip, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(to_strip, type=pa.string())), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -3920,7 +3936,7 @@ def _starts_ends_with( f"{type(pat).__name__}" ) elif is_scalar(pat): - plc_pat = cudf.Scalar(pat, "str").device_value.c_value + plc_pat = plc.interop.from_arrow(pa.scalar(pat, type=pa.string())) else: plc_pat = column.as_column(pat, dtype="str").to_pylibcudf( mode="read" @@ -4120,7 +4136,7 @@ def _find( with acquire_spill_lock(): plc_result = method( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(sub, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sub, type=pa.string())), start, end, ) @@ -4603,7 +4619,7 @@ def filter_characters( plc.strings.translate.FilterType.KEEP if keep else plc.strings.translate.FilterType.REMOVE, - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -4710,10 +4726,10 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: if isinstance(delim, Column): result = self._return_or_inplace( - self._column.tokenize_column(delim), + self._column.tokenize_column(delim), # type: ignore[arg-type] retain_index=False, ) - elif isinstance(delim, cudf.Scalar): + elif isinstance(delim, plc.Scalar): result = self._return_or_inplace( self._column.tokenize_scalar(delim), retain_index=False, @@ -4851,10 +4867,10 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) if isinstance(delim, Column): return self._return_or_inplace( - self._column.count_tokens_column(delim) + self._column.count_tokens_column(delim) # type: ignore[arg-type] ) - elif isinstance(delim, cudf.Scalar): + elif isinstance(delim, plc.Scalar): return self._return_or_inplace( self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) @@ -5112,7 +5128,7 @@ def replace_tokens( self._column.replace_tokens( targets_column, # type: ignore[arg-type] replacements_column, # type: ignore[arg-type] - cudf.Scalar(delimiter, dtype="str"), + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), ), ) @@ -5181,8 +5197,10 @@ def filter_tokens( return self._return_or_inplace( self._column.filter_tokens( min_token_length, - cudf.Scalar(replacement, dtype="str"), - cudf.Scalar(delimiter, dtype="str"), + plc.interop.from_arrow( + pa.scalar(replacement, type=pa.string()) + 
), + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), ), ) @@ -5501,12 +5519,12 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: def _massage_string_arg( value, name, allow_col: bool = False -) -> StringColumn | cudf.Scalar: +) -> StringColumn | plc.Scalar: if isinstance(value, cudf.Scalar): return value if isinstance(value, str): - return cudf.Scalar(value, dtype="str") + return plc.interop.from_arrow(pa.scalar(value, type=pa.string())) allowed_types = ["Scalar"] @@ -5747,8 +5765,8 @@ def sum( with acquire_spill_lock(): plc_column = plc.strings.combine.join_strings( result_col.to_pylibcudf(mode="read"), - cudf.Scalar("").device_value.c_value, - cudf.Scalar(None, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow(pa.scalar(None, type=pa.string())), ) return Column.from_pylibcudf(plc_column).element_indexing(0) else: @@ -5766,7 +5784,7 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: self.to_pylibcudf(mode="read") ) result = Column.from_pylibcudf(plc_column) - return (result > cudf.Scalar(0, dtype="int8")).fillna(False) + return (result > np.int8(0)).fillna(False) elif out_dtype.kind in {"i", "u"}: if not self.is_integer().all(): raise ValueError( @@ -6033,8 +6051,10 @@ def _binaryop( rhs.to_pylibcudf(mode="read"), ] ), - cudf.Scalar("").device_value.c_value, - cudf.Scalar(None, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow( + pa.scalar(None, type=pa.string()) + ), ) return Column.from_pylibcudf(plc_column) elif op in { @@ -6120,11 +6140,11 @@ def jaccard_index(self, other: Self, width: int) -> NumericalColumn: return type(self).from_pylibcudf(result) # type: ignore[return-value] @acquire_spill_lock() - def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + def generate_ngrams(self, ngrams: int, separator: plc.Scalar) -> Self: result = plc.nvtext.generate_ngrams.generate_ngrams( self.to_pylibcudf(mode="read"), ngrams, - separator.device_value.c_value, + separator, ) return type(self).from_pylibcudf(result) # type: ignore[return-value] @@ -6160,13 +6180,13 @@ def edit_distance_matrix(self) -> ListColumn: def byte_pair_encoding( self, merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, - separator: cudf.Scalar, + separator: str, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.byte_pair_encode.byte_pair_encoding( self.to_pylibcudf(mode="read"), merge_pairs, - separator.device_value.c_value, + plc.interop.from_arrow(pa.scalar(separator)), ) ) @@ -6174,15 +6194,15 @@ def byte_pair_encoding( def ngrams_tokenize( self, ngrams: int, - delimiter: cudf.Scalar, - separator: cudf.Scalar, + delimiter: plc.Scalar, + separator: plc.Scalar, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.ngrams_tokenize.ngrams_tokenize( self.to_pylibcudf(mode="read"), ngrams, - delimiter.device_value.c_value, - separator.device_value.c_value, + delimiter, + separator, ) ) @@ -6205,14 +6225,14 @@ def normalize_characters(self, do_lower: bool = True) -> Self: @acquire_spill_lock() def replace_tokens( - self, targets: Self, replacements: Self, delimiter: cudf.Scalar + self, targets: Self, replacements: Self, delimiter: plc.Scalar ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.replace.replace_tokens( self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read"), replacements.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + 
delimiter, ) ) @@ -6220,15 +6240,15 @@ def replace_tokens( def filter_tokens( self, min_token_length: int, - replacement: cudf.Scalar, - delimiter: cudf.Scalar, + replacement: plc.Scalar, + delimiter: plc.Scalar, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.replace.filter_tokens( self.to_pylibcudf(mode="read"), min_token_length, - replacement.device_value.c_value, - delimiter.device_value.c_value, + replacement, + delimiter, ) ) @@ -6279,10 +6299,10 @@ def subword_tokenize( return tokens, masks, metadata @acquire_spill_lock() - def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + def tokenize_scalar(self, delimiter: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.tokenize_scalar( - self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + self.to_pylibcudf(mode="read"), delimiter ) ) @@ -6296,10 +6316,10 @@ def tokenize_column(self, delimiters: Self) -> Self: ) @acquire_spill_lock() - def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + def count_tokens_scalar(self, delimiter: plc.Scalar) -> NumericalColumn: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.count_tokens_scalar( - self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + self.to_pylibcudf(mode="read"), delimiter ) ) @@ -6324,25 +6344,25 @@ def character_tokenize(self) -> Self: def tokenize_with_vocabulary( self, vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, - delimiter: cudf.Scalar, + delimiter: str, default_id: int, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.tokenize_with_vocabulary( self.to_pylibcudf(mode="read"), vocabulary, - delimiter.device_value.c_value, + plc.interop.from_arrow(pa.scalar(delimiter)), default_id, ) ) @acquire_spill_lock() - def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: + def detokenize(self, indices: ColumnBase, separator: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.detokenize( self.to_pylibcudf(mode="read"), indices.to_pylibcudf(mode="read"), - separator.device_value.c_value, + separator, ) ) @@ -6491,23 +6511,23 @@ def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: @acquire_spill_lock() def _split_record( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, maxsplit: int, method: Callable[[plc.Column, plc.Scalar, int], plc.Column], ) -> Self: plc_column = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, maxsplit, ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] - def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + def split_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self: return self._split_record( delimiter, maxsplit, plc.strings.split.split.split_record ) - def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + def rsplit_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self: return self._split_record( delimiter, maxsplit, plc.strings.split.split.rsplit_record ) @@ -6515,13 +6535,13 @@ def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: @acquire_spill_lock() def _split( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, maxsplit: int, method: Callable[[plc.Column, plc.Scalar, int], plc.Column], ) -> dict[int, Self]: plc_table = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, 
maxsplit, ) return dict( @@ -6531,21 +6551,21 @@ def _split( ) ) - def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + def split(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]: return self._split(delimiter, maxsplit, plc.strings.split.split.split) - def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + def rsplit(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]: return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit) @acquire_spill_lock() def _partition( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, method: Callable[[plc.Column, plc.Scalar], plc.Column], ) -> dict[int, Self]: plc_table = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, ) return dict( enumerate( @@ -6554,12 +6574,12 @@ def _partition( ) ) - def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + def partition(self, delimiter: plc.Scalar) -> dict[int, Self]: return self._partition( delimiter, plc.strings.split.partition.partition ) - def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + def rpartition(self, delimiter: plc.Scalar) -> dict[int, Self]: return self._partition( delimiter, plc.strings.split.partition.rpartition ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 40d36a6ff56..5cea35ac0d6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6229,10 +6229,8 @@ def isin(self, values): # TODO: propagate nulls through isin # https://github.com/rapidsai/cudf/issues/7556 - fill_value = cudf.Scalar(False) - def make_false_column_like_self(): - return column.as_column(fill_value, length=len(self), dtype="bool") + return column.as_column(False, length=len(self), dtype="bool") # Preprocess different input types into a mapping from column names to # a list of values to check. diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6ae524d6346..17302311a7e 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -14,6 +14,7 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa import pylibcudf as plc @@ -45,6 +46,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply +from cudf.utils.dtypes import cudf_dtype_to_pa_type from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -852,7 +854,9 @@ def _shift( plc.table.Table([col.to_pylibcudf(mode="read") for col in values]), [periods] * len(values), [ - cudf.Scalar(val, dtype=col.dtype).device_value.c_value + plc.interop.from_arrow( + pa.scalar(val, type=cudf_dtype_to_pa_type(col.dtype)) + ) for val, col in zip(fill_values, values) ], ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 54635b162bc..b535e8aabd2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -2327,8 +2327,7 @@ def microsecond(self) -> Index: # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this # __mul__ returns an int16 column. 
- self._column.millisecond.astype("int32") - * cudf.Scalar(1000, dtype="int32") + self._column.millisecond.astype("int32") * np.int32(1000) ) + self._column.microsecond, name=self.name, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index c779e1ebe97..eded681baf0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3255,7 +3255,7 @@ def duplicated( ) distinct = libcudf.column.Column.from_pylibcudf(plc_column) result = copying.scatter( - [cudf.Scalar(False, dtype=bool)], + [cudf.Scalar(False)], distinct, [as_column(True, length=len(self), dtype=bool)], bounds_check=False, diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 3b047ee5ed4..805f9f9a9f9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -4106,8 +4106,8 @@ def microsecond(self) -> Series: # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this # __mul__ returns an int16 column. - extra = self.series._column.millisecond.astype("int32") * cudf.Scalar( - 1000, dtype="int32" + extra = self.series._column.millisecond.astype("int32") * np.int32( + 1000 ) return self._return_result_like_self(micro + extra) diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index fb8b9b3131c..58dabc85491 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -42,9 +42,8 @@ def tokenize( """ if delimiter is None: delimiter = "" - delim = cudf.Scalar(delimiter, dtype="str") result = text._column.tokenize_with_vocabulary( - self.vocabulary, delim, default_id + self.vocabulary, delimiter, default_id ) return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2f8a6d9e5e7..e2c332f34f5 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,10 +1,11 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION +# Copyright (c) 2020-2025, NVIDIA CORPORATION from __future__ import annotations import warnings from typing import TYPE_CHECKING import numba +import numpy as np import pandas as pd from pandas.api.indexers import BaseIndexer @@ -273,12 +274,8 @@ def _apply_agg_column(self, source_column, agg_name): end = as_column(end, dtype="int32") idx = as_column(range(len(start))) - preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( - "int32" - ) - following_window = (end - idx - cudf.Scalar(1, "int32")).astype( - "int32" - ) + preceding_window = (idx - start + np.int32(1)).astype("int32") + following_window = (end - idx - np.int32(1)).astype("int32") window = None else: preceding_window = as_column(self.window) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index da0aa5be6f5..b1f81edfc54 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
import functools import operator @@ -14,6 +14,7 @@ from cudf.core.column.column import column_empty from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES +from cudf.utils.dtypes import cudf_dtype_to_pa_type @pytest.mark.parametrize( @@ -423,7 +424,9 @@ def test_get_ind_sequence(): def test_contains_scalar(data, scalar, expect): sr = cudf.Series(data) expect = cudf.Series(expect) - got = sr.list.contains(cudf.Scalar(scalar, sr.dtype.element_type)) + got = sr.list.contains( + pa.scalar(scalar, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) assert_eq(expect, got) @@ -455,7 +458,9 @@ def test_contains_scalar(data, scalar, expect): def test_contains_null_search_key(data, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="bool") - got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type)) + got = sr.list.contains( + pa.scalar(None, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) assert_eq(expect, got) @@ -518,12 +523,12 @@ def test_contains_invalid(data, scalar): ), ( [["d", None, "e"], [None, "f"], []], - cudf.Scalar(cudf.NA, "O"), + pa.scalar(None, type=pa.string()), [None, None, None], ), ( [None, [10, 9, 8], [5, 8, None]], - cudf.Scalar(cudf.NA, "int64"), + pa.scalar(None, type=pa.int64()), [None, None, None], ), ], @@ -532,7 +537,11 @@ def test_index(data, search_key, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="int32") if is_scalar(search_key): - got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type)) + got = sr.list.index( + pa.scalar( + search_key, type=cudf_dtype_to_pa_type(sr.dtype.element_type) + ) + ) else: got = sr.list.index( cudf.Series(search_key, dtype=sr.dtype.element_type) From 76f1c8ba9f2fd7ab6a6f3fd017ce11dd27963827 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 8 Jan 2025 15:02:10 -0600 Subject: [PATCH 8/9] Use latest ci-conda images (#17690) Use `ci-conda:latest` tags for all jobs. All jobs should now support `ci-conda:latest`, and older pinnings are probably not necessary anymore. 
Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/17690 --- .github/workflows/build.yaml | 2 +- .github/workflows/pr.yaml | 6 +++--- .github/workflows/test.yaml | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index fb7182f4133..65aebfb7f8c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9d79733703c..e955b8f1f80 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -186,7 +186,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" static-configure: needs: checks @@ -207,7 +207,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -217,7 +217,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 858352f515d..dc82c17022a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit @@ -94,7 +94,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -106,7 +106,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit From cb77046d8baad31f4856c097f7052b3a3858c363 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 8 Jan 2025 21:05:41 -0500 Subject: [PATCH 9/9] Bump Polars version to <1.18 (#17632) This PR upgrades the Polars version to 1.17. It xfails some polars tests due to known issues and adds the `maintain_order` param to joins (not implemented yet). 
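As an illustrative sketch (not from this patch) of the user-visible behaviour, assuming polars 1.17 with this cudf-polars build installed: a join with the default `maintain_order="none"` still translates and runs on the GPU, while any other value reaches the new `NotImplementedError` guard in the IR. The frame contents and the broad exception catch (polars typically re-surfaces the failure as a `ComputeError`) are assumptions made only for this example.

```python
import polars as pl

left = pl.LazyFrame({"a": [1, 2, 3], "b": [10, 20, 30]})
right = pl.LazyFrame({"a": [2, 3], "c": ["x", "y"]})
gpu = pl.GPUEngine(raise_on_fail=True)

# Default maintain_order="none": translated and executed by cudf-polars.
print(left.join(right, on="a", how="inner").collect(engine=gpu))

# Any other value is rejected by the GPU engine (without raise_on_fail=True,
# polars would instead fall back to its CPU engine).
try:
    left.join(right, on="a", how="inner", maintain_order="left").collect(engine=gpu)
except Exception as err:  # typically surfaced as polars.exceptions.ComputeError
    print(f"maintain_order='left' is not supported on the GPU yet: {err}")
```
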
Notable change Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17632 --- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/meta.yaml | 4 +- dependencies.yaml | 2 +- python/cudf_polars/cudf_polars/dsl/ir.py | 42 ++++++++++++++++--- .../cudf_polars/cudf_polars/dsl/translate.py | 4 +- .../cudf_polars/cudf_polars/testing/plugin.py | 21 ++++++++++ python/cudf_polars/pyproject.toml | 4 +- python/cudf_polars/tests/test_join.py | 11 ++++- 9 files changed, 76 insertions(+), 16 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a4b3f4fe174..6ff9a5f832b 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.15 +- polars>=1.11,<1.18 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<19.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 7173c955116..e82192b8cdb 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.15 +- polars>=1.11,<1.18 - pre-commit - pyarrow>=14.0.0,<19.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index b6c03dc1bc2..7a0005497df 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.15 + - polars >=1.11,<1.18 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index b0f217a6770..50b4cd3c372 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -747,7 +747,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.15 + - polars>=1.11,<1.18 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 1c1d4860eec..fd56329a48e 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """ DSL nodes for the LogicalPlan of polars. 
@@ -34,9 +34,11 @@ from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: - from collections.abc import Callable, Hashable, MutableMapping, Sequence + from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence from typing import Literal + from polars.polars import _expr_nodes as pl_expr + from cudf_polars.typing import Schema @@ -1019,7 +1021,27 @@ class ConditionalJoin(IR): __slots__ = ("ast_predicate", "options", "predicate") _non_child = ("schema", "predicate", "options") predicate: expr.Expr - options: tuple + """Expression predicate to join on""" + options: tuple[ + tuple[ + str, + pl_expr.Operator | Iterable[pl_expr.Operator], + ], + bool, + tuple[int, int] | None, + str, + bool, + Literal["none", "left", "right", "left_right", "right_left"], + ] + """ + tuple of options: + - predicates: tuple of ir join type (eg. ie_join) and (In)Equality conditions + - join_nulls: do nulls compare equal? + - slice: optional slice to perform after joining. + - suffix: string suffix for right columns if names match + - coalesce: should key columns be coalesced (only makes sense for outer joins) + - maintain_order: which DataFrame row order to preserve, if any + """ def __init__( self, schema: Schema, predicate: expr.Expr, options: tuple, left: IR, right: IR @@ -1029,15 +1051,16 @@ def __init__( self.options = options self.children = (left, right) self.ast_predicate = to_ast(predicate) - _, join_nulls, zlice, suffix, coalesce = self.options + _, join_nulls, zlice, suffix, coalesce, maintain_order = self.options # Preconditions from polars assert not join_nulls assert not coalesce + assert maintain_order == "none" if self.ast_predicate is None: raise NotImplementedError( f"Conditional join with predicate {predicate}" ) # pragma: no cover; polars never delivers expressions we can't handle - self._non_child_args = (self.ast_predicate, zlice, suffix) + self._non_child_args = (self.ast_predicate, zlice, suffix, maintain_order) @classmethod def do_evaluate( @@ -1045,6 +1068,7 @@ def do_evaluate( predicate: plc.expressions.Expression, zlice: tuple[int, int] | None, suffix: str, + maintain_order: Literal["none", "left", "right", "left_right", "right_left"], left: DataFrame, right: DataFrame, ) -> DataFrame: @@ -1088,6 +1112,7 @@ class Join(IR): tuple[int, int] | None, str, bool, + Literal["none", "left", "right", "left_right", "right_left"], ] """ tuple of options: @@ -1096,6 +1121,7 @@ class Join(IR): - slice: optional slice to perform after joining. 
- suffix: string suffix for right columns if names match - coalesce: should key columns be coalesced (only makes sense for outer joins) + - maintain_order: which DataFrame row order to preserve, if any """ def __init__( @@ -1113,6 +1139,9 @@ def __init__( self.options = options self.children = (left, right) self._non_child_args = (self.left_on, self.right_on, self.options) + # TODO: Implement maintain_order + if options[5] != "none": + raise NotImplementedError("maintain_order not implemented yet") if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -1222,12 +1251,13 @@ def do_evaluate( tuple[int, int] | None, str, bool, + Literal["none", "left", "right", "left_right", "right_left"], ], left: DataFrame, right: DataFrame, ) -> DataFrame: """Evaluate and return a dataframe.""" - how, join_nulls, zlice, suffix, coalesce = options + how, join_nulls, zlice, suffix, coalesce, _ = options if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 37cf36dc4dd..2138ac0c700 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Translate polars IR representation to ours.""" @@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - if (version := self.visitor.version()) >= (4, 0): + if (version := self.visitor.version()) >= (4, 3): e = NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. 
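As a minimal sketch (values invented for illustration, not taken from the patch), the six-element `options` tuple documented in the `Join` and `ConditionalJoin` docstrings above unpacks like this:

```python
# (how, join_nulls, slice, suffix, coalesce, maintain_order) -- example values only
options = ("inner", False, None, "_right", False, "none")
how, join_nulls, zlice, suffix, coalesce, maintain_order = options
# Anything other than "none" for maintain_order raises NotImplementedError in Join.__init__.
assert maintain_order == "none"
```
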
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 87628242838..c16df320ceb 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -123,6 +123,11 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[read_parquet-]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-0]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-2]": "Need to add include_file_path to IR", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly", @@ -140,6 +145,22 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + 
"tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero", + "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero", "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype", diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 5904942aea2..9fb9bbf391e 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.15", + "polars>=1.11,<1.18", "pylibcudf==25.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 2fcbbf21f1c..f1f47bfb9f1 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -53,6 +53,15 @@ def right(): ) +@pytest.mark.parametrize( + "maintain_order", ["left", "left_right", "right_left", "right"] +) +def test_join_maintain_order_param_unsupported(left, right, maintain_order): + q = left.join(right, on=pl.col("a"), how="inner", maintain_order=maintain_order) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "join_expr", [