diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index fb7182f4133..65aebfb7f8c 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -62,7 +62,7 @@ jobs: arch: "amd64" branch: ${{ inputs.branch }} build_type: ${{ inputs.build_type || 'branch' }} - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" date: ${{ inputs.date }} node_type: "gpu-v100-latest-1" run_script: "ci/build_docs.sh" diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 9d79733703c..e955b8f1f80 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -186,7 +186,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" static-configure: needs: checks @@ -207,7 +207,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" docs-build: needs: conda-python-build @@ -217,7 +217,7 @@ jobs: build_type: pull-request node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/build_docs.sh" wheel-build-libcudf: needs: checks diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 858352f515d..dc82c17022a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -41,7 +41,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" static-configure: secrets: inherit @@ -94,7 +94,7 @@ jobs: sha: ${{ 
inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" conda-notebook-tests: secrets: inherit @@ -106,7 +106,7 @@ jobs: sha: ${{ inputs.sha }} node_type: "gpu-v100-latest-1" arch: "amd64" - container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11" + container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_notebooks.sh" wheel-tests-cudf: secrets: inherit diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 73c4567d3a4..94d27d976c3 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "io/utilities/getenv_or.hpp" + #include #include #include @@ -277,7 +279,7 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts) CUDF_EXPORT auto& kernel_pinned_copy_threshold() { // use cudaMemcpyAsync for all pinned copies - static std::atomic threshold = 0; + static std::atomic threshold = getenv_or("LIBCUDF_KERNEL_PINNED_COPY_THRESHOLD", 0); return threshold; } @@ -291,7 +293,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 0; + static std::atomic threshold = getenv_or("LIBCUDF_ALLOCATE_HOST_AS_PINNED_THRESHOLD", 0); return threshold; } diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 9c436dfad18..cad4b1aa72c 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
"""Define common type operations.""" @@ -13,6 +13,7 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa from pandas.api import types as pd_types import cudf @@ -144,6 +145,7 @@ def is_scalar(val): cudf.Scalar, cudf._lib.scalar.DeviceScalar, cudf.core.tools.datetimes.DateOffset, + pa.Scalar, ), ) or ( pd_types.is_scalar(val) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index c2f3c782d10..2806a1f6c23 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -350,7 +350,7 @@ def names(self, values): self.name = values[0] - def _clean_nulls_from_index(self): + def _pandas_repr_compatible(self): """ Convert all na values(if any) in Index object to `` as a preprocessing step to `__repr__` methods. diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index b49f5154697..0fe47255368 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -53,7 +53,6 @@ def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: 1 this is it dtype: object """ - sep = cudf.Scalar(separator, dtype="str") return cudf.Series._from_column( - text._column.byte_pair_encoding(self.merge_pairs, sep) + text._column.byte_pair_encoding(self.merge_pairs, separator) ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 24b657f1c32..e23ca810065 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -77,6 +77,7 @@ from cudf._typing import ColumnLike, Dtype, ScalarLike from cudf.core.column.numerical import NumericalColumn + from cudf.core.column.string import StringColumn if PANDAS_GE_210: NumpyExtensionArray = pd.arrays.NumpyExtensionArray @@ -92,6 +93,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible): "min", } + _PANDAS_NA_REPR = str(pd.NA) + def data_array_view( self, *, mode: Literal["write", "read"] = "write" ) -> "cuda.devicearray.DeviceNDArray": @@ -176,6 +179,17 @@ def __repr__(self): f"dtype: {self.dtype}" ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + if self.has_nulls(): + return self.astype("str").fillna(self._PANDAS_NA_REPR) + return self + def to_pandas( self, *, @@ -239,8 +253,12 @@ def find_and_replace( def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self: plc_column = plc.replace.clamp( self.to_pylibcudf(mode="read"), - cudf.Scalar(lo, self.dtype).device_value.c_value, - cudf.Scalar(hi, self.dtype).device_value.c_value, + plc.interop.from_arrow( + pa.scalar(lo, type=cudf_dtype_to_pa_type(self.dtype)) + ), + plc.interop.from_arrow( + pa.scalar(hi, type=cudf_dtype_to_pa_type(self.dtype)) + ), ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] @@ -1015,7 +1033,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase: # https://github.com/rapidsai/cudf/issues/14515 by # providing a mode in which cudf::contains does not mask # the result. 
- result = result.fillna(cudf.Scalar(rhs.null_count > 0)) + result = result.fillna(rhs.null_count > 0) return result def as_mask(self) -> Buffer: @@ -1981,12 +1999,12 @@ def as_column( column = Column.from_pylibcudf( plc.filling.sequence( len(arbitrary), - cudf.Scalar( - arbitrary.start, dtype=np.dtype(np.int64) - ).device_value.c_value, - cudf.Scalar( - arbitrary.step, dtype=np.dtype(np.int64) - ).device_value.c_value, + plc.interop.from_arrow( + pa.scalar(arbitrary.start, type=pa.int64()) + ), + plc.interop.from_arrow( + pa.scalar(arbitrary.step, type=pa.int64()) + ), ) ) if cudf.get_option("default_integer_bitwidth") and dtype is None: diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index b6a4122ebb9..1bde7d27700 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -212,6 +212,8 @@ class DatetimeColumn(column.ColumnBase): "__rsub__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, @@ -351,8 +353,8 @@ def is_year_end(self) -> ColumnBase: day_of_year = self.day_of_year leap_dates = self.is_leap_year - leap = day_of_year == cudf.Scalar(366) - non_leap = day_of_year == cudf.Scalar(365) + leap = day_of_year == 366 + non_leap = day_of_year == 365 return leap.copy_if_else(non_leap, leap_dates).fillna(False) @property diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6283e498842..6fc2b5d4ca2 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -28,6 +28,7 @@ from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class ListColumn(ColumnBase): @@ -67,6 +68,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + # TODO: handle if self.has_nulls(): case + return self + @cached_property def memory_usage(self): n = super().memory_usage @@ -274,7 +285,7 @@ def as_string_column(self) -> cudf.core.column.StringColumn: with acquire_spill_lock(): plc_column = plc.strings.convert.convert_lists.format_list_column( lc.to_pylibcudf(mode="read"), - cudf.Scalar("None").device_value.c_value, + plc.interop.from_arrow(pa.scalar("None")), separators.to_pylibcudf(mode="read"), ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] @@ -380,20 +391,20 @@ def extract_element_column(self, index: ColumnBase) -> ColumnBase: ) @acquire_spill_lock() - def contains_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + def contains_scalar(self, search_key: pa.Scalar) -> ColumnBase: return type(self).from_pylibcudf( plc.lists.contains( self.to_pylibcudf(mode="read"), - search_key.device_value.c_value, + plc.interop.from_arrow(search_key), ) ) @acquire_spill_lock() - def index_of_scalar(self, search_key: cudf.Scalar) -> ColumnBase: + def index_of_scalar(self, search_key: pa.Scalar) -> ColumnBase: return type(self).from_pylibcudf( plc.lists.index_of( self.to_pylibcudf(mode="read"), - search_key.device_value.c_value, + plc.interop.from_arrow(search_key), plc.lists.DuplicateFindOption.FIND_FIRST, ) ) @@ -558,7 +569,7 @@ def contains(self, search_key: ScalarLike) -> ParentType: dtype: bool """ return self._return_or_inplace( - 
self._column.contains_scalar(cudf.Scalar(search_key)) + self._column.contains_scalar(pa.scalar(search_key)) ) def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: @@ -607,7 +618,7 @@ def index(self, search_key: ScalarLike | ColumnLike) -> ParentType: """ if is_scalar(search_key): - result = self._column.index_of_scalar(cudf.Scalar(search_key)) + result = self._column.index_of_scalar(pa.scalar(search_key)) else: result = self._column.index_of_column(as_column(search_key)) return self._return_or_inplace(result) diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 8fe5299fcdd..70103745926 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -7,6 +7,7 @@ import numpy as np import pandas as pd +import pyarrow as pa from numba.np import numpy_support from typing_extensions import Self @@ -382,12 +383,8 @@ def as_string_column(self) -> cudf.core.column.StringColumn: elif self.dtype.kind == "b": conv_func = functools.partial( plc.strings.convert.convert_booleans.from_booleans, - true_string=cudf.Scalar( - "True", dtype="str" - ).device_value.c_value, - false_string=cudf.Scalar( - "False", dtype="str" - ).device_value.c_value, + true_string=plc.interop.from_arrow(pa.scalar("True")), + false_string=plc.interop.from_arrow(pa.scalar("False")), ) elif self.dtype.kind in {"i", "u"}: conv_func = plc.strings.convert.convert_integers.from_integers diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fcdcb789f23..20eded9a27f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -302,8 +302,10 @@ def cat(self, others=None, sep=None, na_rep=None): with acquire_spill_lock(): plc_column = plc.strings.combine.join_strings( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(na_rep, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow( + pa.scalar(na_rep, type=pa.string()) + ), ) data = Column.from_pylibcudf(plc_column) else: @@ -359,8 +361,10 @@ def cat(self, others=None, sep=None, na_rep=None): ) ] ), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(na_rep, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow( + pa.scalar(na_rep, type=pa.string()) + ), ) data = Column.from_pylibcudf(plc_column) @@ -522,11 +526,9 @@ def join( with acquire_spill_lock(): plc_column = plc.strings.combine.join_list_elements( strings_column.to_pylibcudf(mode="read"), - cudf.Scalar(sep).device_value.c_value, - cudf.Scalar(string_na_rep).device_value.c_value, - cudf._lib.scalar.DeviceScalar( - "", cudf.dtype("object") - ).c_value, + plc.interop.from_arrow(pa.scalar(sep)), + plc.interop.from_arrow(pa.scalar(string_na_rep)), + plc.interop.from_arrow(pa.scalar("")), plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -547,8 +549,8 @@ def join( plc_column = plc.strings.combine.join_list_elements( strings_column.to_pylibcudf(mode="read"), sep_column.to_pylibcudf(mode="read"), - cudf.Scalar(sep_na_rep).device_value.c_value, - cudf.Scalar(string_na_rep).device_value.c_value, + plc.interop.from_arrow(pa.scalar(sep_na_rep)), + plc.interop.from_arrow(pa.scalar(string_na_rep)), plc.strings.combine.SeparatorOnNulls.YES, plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT, ) @@ -800,14 +802,14 @@ def contains( else: if case is False: input_column = self.lower()._column # type: ignore[union-attr] - plc_pat = cudf.Scalar(pat.lower(), dtype="str") # type: 
ignore[union-attr] + pat_normed = pat.lower() # type: ignore[union-attr] else: input_column = self._column - plc_pat = cudf.Scalar(pat, dtype="str") + pat_normed = pat with acquire_spill_lock(): plc_result = plc.strings.find.contains( input_column.to_pylibcudf(mode="read"), - plc_pat.device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat_normed)), ) result_col = Column.from_pylibcudf(plc_result) else: @@ -892,8 +894,8 @@ def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex: with acquire_spill_lock(): plc_result = plc.strings.contains.like( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(pat, "str").device_value.c_value, - cudf.Scalar(esc, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat)), + plc.interop.from_arrow(pa.scalar(esc)), ) result = Column.from_pylibcudf(plc_result) @@ -1071,14 +1073,14 @@ def replace( plc.strings.regex_program.RegexProgram.create( pat, plc.strings.regex_flags.RegexFlags.DEFAULT ), - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl)), n, ) else: plc_result = plc.strings.replace.replace( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(pat).device_value.c_value, - cudf.Scalar(repl).device_value.c_value, + plc.interop.from_arrow(pa.scalar(pat)), + plc.interop.from_arrow(pa.scalar(repl)), n, ) result = Column.from_pylibcudf(plc_result) @@ -1194,13 +1196,13 @@ def slice( 2 cm dtype: object """ - param_dtype = np.dtype(np.int32) + param_dtype = pa.int32() with acquire_spill_lock(): plc_result = plc.strings.slice.slice_strings( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(start, param_dtype).device_value.c_value, - cudf.Scalar(stop, param_dtype).device_value.c_value, - cudf.Scalar(step, param_dtype).device_value.c_value, + plc.interop.from_arrow(pa.scalar(start, param_dtype)), + plc.interop.from_arrow(pa.scalar(stop, param_dtype)), + plc.interop.from_arrow(pa.scalar(step, param_dtype)), ) result = Column.from_pylibcudf(plc_result) return 
self._return_or_inplace(result) @@ -2174,7 +2176,7 @@ def filter_alphanum( plc.strings.char_types.StringCharacterTypes.ALL_TYPES if keep else plc.strings.char_types.StringCharacterTypes.ALPHANUM, - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), plc.strings.char_types.StringCharacterTypes.ALPHANUM if keep else plc.strings.char_types.StringCharacterTypes.ALL_TYPES, @@ -2318,7 +2320,7 @@ def slice_replace( with acquire_spill_lock(): plc_result = plc.strings.replace.replace_slice( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), start, stop, ) @@ -2499,7 +2501,7 @@ def get_json_object( with acquire_spill_lock(): plc_result = plc.json.get_json_object( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(json_path, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(json_path)), options, ) result = Column.from_pylibcudf(plc_result) @@ -2657,7 +2659,12 @@ def split( if regex is True: data = self._column.split_re(pat, n) else: - data = self._column.split(cudf.Scalar(pat, "str"), n) + data = self._column.split( + plc.interop.from_arrow( + pa.scalar(pat, type=pa.string()) + ), + n, + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ -2667,7 +2674,7 @@ def split( result_table = self._column.split_record_re(pat, n) else: result_table = self._column.split_record( - cudf.Scalar(pat, "str"), n + plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2829,7 +2836,12 @@ def rsplit( if regex is True: data = self._column.rsplit_re(pat, n) else: - data = self._column.rsplit(cudf.Scalar(pat, "str"), n) + data = self._column.rsplit( + plc.interop.from_arrow( + pa.scalar(pat, type=pa.string()) + ), + n, + ) if len(data) == 1 and data[0].null_count == len(self._column): result_table = {} else: @@ -2839,7 
+2851,7 @@ def rsplit( result_table = self._column.rsplit_record_re(pat, n) else: result_table = self._column.rsplit_record( - cudf.Scalar(pat, "str"), n + plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n ) return self._return_or_inplace(result_table, expand=expand) @@ -2924,7 +2936,9 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - self._column.partition(cudf.Scalar(sep, "str")), + self._column.partition( + plc.interop.from_arrow(pa.scalar(sep, type=pa.string())) + ), expand=expand, ) @@ -2989,7 +3003,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex: sep = " " return self._return_or_inplace( - self._column.rpartition(cudf.Scalar(sep, "str")), + self._column.rpartition( + plc.interop.from_arrow(pa.scalar(sep, type=pa.string())) + ), expand=expand, ) @@ -3303,7 +3319,7 @@ def _strip( plc_result = plc.strings.strip.strip( self._column.to_pylibcudf(mode="read"), side, - cudf.Scalar(to_strip, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(to_strip, type=pa.string())), ) result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -3920,7 +3936,7 @@ def _starts_ends_with( f"{type(pat).__name__}" ) elif is_scalar(pat): - plc_pat = cudf.Scalar(pat, "str").device_value.c_value + plc_pat = plc.interop.from_arrow(pa.scalar(pat, type=pa.string())) else: plc_pat = column.as_column(pat, dtype="str").to_pylibcudf( mode="read" @@ -4120,7 +4136,7 @@ def _find( with acquire_spill_lock(): plc_result = method( self._column.to_pylibcudf(mode="read"), - cudf.Scalar(sub, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(sub, type=pa.string())), start, end, ) @@ -4603,7 +4619,7 @@ def filter_characters( plc.strings.translate.FilterType.KEEP if keep else plc.strings.translate.FilterType.REMOVE, - cudf.Scalar(repl, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar(repl, type=pa.string())), ) result = 
Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) @@ -4710,10 +4726,10 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: if isinstance(delim, Column): result = self._return_or_inplace( - self._column.tokenize_column(delim), + self._column.tokenize_column(delim), # type: ignore[arg-type] retain_index=False, ) - elif isinstance(delim, cudf.Scalar): + elif isinstance(delim, plc.Scalar): result = self._return_or_inplace( self._column.tokenize_scalar(delim), retain_index=False, @@ -4851,10 +4867,10 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) if isinstance(delim, Column): return self._return_or_inplace( - self._column.count_tokens_column(delim) + self._column.count_tokens_column(delim) # type: ignore[arg-type] ) - elif isinstance(delim, cudf.Scalar): + elif isinstance(delim, plc.Scalar): return self._return_or_inplace( self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) @@ -5112,7 +5128,7 @@ def replace_tokens( self._column.replace_tokens( targets_column, # type: ignore[arg-type] replacements_column, # type: ignore[arg-type] - cudf.Scalar(delimiter, dtype="str"), + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), ), ) @@ -5181,8 +5197,10 @@ def filter_tokens( return self._return_or_inplace( self._column.filter_tokens( min_token_length, - cudf.Scalar(replacement, dtype="str"), - cudf.Scalar(delimiter, dtype="str"), + plc.interop.from_arrow( + pa.scalar(replacement, type=pa.string()) + ), + plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())), ), ) @@ -5501,12 +5519,12 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: def _massage_string_arg( value, name, allow_col: bool = False -) -> StringColumn | cudf.Scalar: +) -> StringColumn | plc.Scalar: if isinstance(value, cudf.Scalar): return value if isinstance(value, str): - return cudf.Scalar(value, dtype="str") + return 
plc.interop.from_arrow(pa.scalar(value, type=pa.string())) allowed_types = ["Scalar"] @@ -5747,8 +5765,8 @@ def sum( with acquire_spill_lock(): plc_column = plc.strings.combine.join_strings( result_col.to_pylibcudf(mode="read"), - cudf.Scalar("").device_value.c_value, - cudf.Scalar(None, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow(pa.scalar(None, type=pa.string())), ) return Column.from_pylibcudf(plc_column).element_indexing(0) else: @@ -5766,7 +5784,7 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: self.to_pylibcudf(mode="read") ) result = Column.from_pylibcudf(plc_column) - return (result > cudf.Scalar(0, dtype="int8")).fillna(False) + return (result > np.int8(0)).fillna(False) elif out_dtype.kind in {"i", "u"}: if not self.is_integer().all(): raise ValueError( @@ -6033,8 +6051,10 @@ def _binaryop( rhs.to_pylibcudf(mode="read"), ] ), - cudf.Scalar("").device_value.c_value, - cudf.Scalar(None, "str").device_value.c_value, + plc.interop.from_arrow(pa.scalar("")), + plc.interop.from_arrow( + pa.scalar(None, type=pa.string()) + ), ) return Column.from_pylibcudf(plc_column) elif op in { @@ -6120,11 +6140,11 @@ def jaccard_index(self, other: Self, width: int) -> NumericalColumn: return type(self).from_pylibcudf(result) # type: ignore[return-value] @acquire_spill_lock() - def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + def generate_ngrams(self, ngrams: int, separator: plc.Scalar) -> Self: result = plc.nvtext.generate_ngrams.generate_ngrams( self.to_pylibcudf(mode="read"), ngrams, - separator.device_value.c_value, + separator, ) return type(self).from_pylibcudf(result) # type: ignore[return-value] @@ -6160,13 +6180,13 @@ def edit_distance_matrix(self) -> ListColumn: def byte_pair_encoding( self, merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, - separator: cudf.Scalar, + separator: str, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] 
plc.nvtext.byte_pair_encode.byte_pair_encoding( self.to_pylibcudf(mode="read"), merge_pairs, - separator.device_value.c_value, + plc.interop.from_arrow(pa.scalar(separator)), ) ) @@ -6174,15 +6194,15 @@ def byte_pair_encoding( def ngrams_tokenize( self, ngrams: int, - delimiter: cudf.Scalar, - separator: cudf.Scalar, + delimiter: plc.Scalar, + separator: plc.Scalar, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.ngrams_tokenize.ngrams_tokenize( self.to_pylibcudf(mode="read"), ngrams, - delimiter.device_value.c_value, - separator.device_value.c_value, + delimiter, + separator, ) ) @@ -6205,14 +6225,14 @@ def normalize_characters(self, do_lower: bool = True) -> Self: @acquire_spill_lock() def replace_tokens( - self, targets: Self, replacements: Self, delimiter: cudf.Scalar + self, targets: Self, replacements: Self, delimiter: plc.Scalar ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.replace.replace_tokens( self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read"), replacements.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, ) ) @@ -6220,15 +6240,15 @@ def replace_tokens( def filter_tokens( self, min_token_length: int, - replacement: cudf.Scalar, - delimiter: cudf.Scalar, + replacement: plc.Scalar, + delimiter: plc.Scalar, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.replace.filter_tokens( self.to_pylibcudf(mode="read"), min_token_length, - replacement.device_value.c_value, - delimiter.device_value.c_value, + replacement, + delimiter, ) ) @@ -6279,10 +6299,10 @@ def subword_tokenize( return tokens, masks, metadata @acquire_spill_lock() - def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + def tokenize_scalar(self, delimiter: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.tokenize_scalar( - self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + 
self.to_pylibcudf(mode="read"), delimiter ) ) @@ -6296,10 +6316,10 @@ def tokenize_column(self, delimiters: Self) -> Self: ) @acquire_spill_lock() - def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + def count_tokens_scalar(self, delimiter: plc.Scalar) -> NumericalColumn: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.count_tokens_scalar( - self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + self.to_pylibcudf(mode="read"), delimiter ) ) @@ -6324,25 +6344,25 @@ def character_tokenize(self) -> Self: def tokenize_with_vocabulary( self, vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, - delimiter: cudf.Scalar, + delimiter: str, default_id: int, ) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.tokenize_with_vocabulary( self.to_pylibcudf(mode="read"), vocabulary, - delimiter.device_value.c_value, + plc.interop.from_arrow(pa.scalar(delimiter)), default_id, ) ) @acquire_spill_lock() - def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> Self: + def detokenize(self, indices: ColumnBase, separator: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] plc.nvtext.tokenize.detokenize( self.to_pylibcudf(mode="read"), indices.to_pylibcudf(mode="read"), - separator.device_value.c_value, + separator, ) ) @@ -6491,23 +6511,23 @@ def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]: @acquire_spill_lock() def _split_record( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, maxsplit: int, method: Callable[[plc.Column, plc.Scalar, int], plc.Column], ) -> Self: plc_column = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, maxsplit, ) return type(self).from_pylibcudf(plc_column) # type: ignore[return-value] - def split_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + def split_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self: return 
self._split_record( delimiter, maxsplit, plc.strings.split.split.split_record ) - def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: + def rsplit_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self: return self._split_record( delimiter, maxsplit, plc.strings.split.split.rsplit_record ) @@ -6515,13 +6535,13 @@ def rsplit_record(self, delimiter: cudf.Scalar, maxsplit: int) -> Self: @acquire_spill_lock() def _split( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, maxsplit: int, method: Callable[[plc.Column, plc.Scalar, int], plc.Column], ) -> dict[int, Self]: plc_table = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, maxsplit, ) return dict( @@ -6531,21 +6551,21 @@ def _split( ) ) - def split(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + def split(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]: return self._split(delimiter, maxsplit, plc.strings.split.split.split) - def rsplit(self, delimiter: cudf.Scalar, maxsplit: int) -> dict[int, Self]: + def rsplit(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]: return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit) @acquire_spill_lock() def _partition( self, - delimiter: cudf.Scalar, + delimiter: plc.Scalar, method: Callable[[plc.Column, plc.Scalar], plc.Column], ) -> dict[int, Self]: plc_table = method( self.to_pylibcudf(mode="read"), - delimiter.device_value.c_value, + delimiter, ) return dict( enumerate( @@ -6554,12 +6574,12 @@ def _partition( ) ) - def partition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + def partition(self, delimiter: plc.Scalar) -> dict[int, Self]: return self._partition( delimiter, plc.strings.split.partition.partition ) - def rpartition(self, delimiter: cudf.Scalar) -> dict[int, Self]: + def rpartition(self, delimiter: plc.Scalar) -> dict[int, Self]: return self._partition( delimiter, plc.strings.split.partition.rpartition ) diff --git 
a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py index ba765b50729..052a68cec98 100644 --- a/python/cudf/cudf/core/column/struct.py +++ b/python/cudf/cudf/core/column/struct.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from functools import cached_property @@ -18,6 +18,7 @@ from cudf._typing import Dtype from cudf.core.buffer import Buffer + from cudf.core.column.string import StringColumn class StructColumn(ColumnBase): @@ -51,6 +52,16 @@ def __init__( children=children, ) + def _prep_pandas_compat_repr(self) -> StringColumn | Self: + """ + Preprocess Column to be compatible with pandas repr, namely handling nulls. + + * null (datetime/timedelta) = str(pd.NaT) + * null (other types)= str(pd.NA) + """ + # TODO: handle if self.has_nulls(): case + return self + @staticmethod def _validate_dtype_instance(dtype: StructDtype) -> StructDtype: # IntervalDtype is a subclass of StructDtype, so compare types exactly diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 749ab8e837a..302178ea277 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -81,6 +81,8 @@ class TimeDeltaColumn(ColumnBase): "__rfloordiv__", } + _PANDAS_NA_REPR = str(pd.NaT) + def __init__( self, data: Buffer, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index b2121511a14..5cea35ac0d6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -1894,7 +1894,7 @@ def astype( dtype = {cc: dtype for cc in self._column_names} return super().astype(dtype, copy, errors) - def _clean_renderable_dataframe(self, output): + def _clean_renderable_dataframe(self, output: Self) -> str: """ This method takes in partial/preprocessed dataframe and returns correct representation of it with correct @@ -1929,41 +1929,7 @@ def _clean_renderable_dataframe(self, output): ) return "\n".join(lines) - def _clean_nulls_from_dataframe(self, df): - """ - This function converts all ``null`` values to ```` for - representation as a string in `__repr__`. - - Since we utilize Pandas `__repr__` at all places in our code - for formatting purposes, we convert columns to `str` dtype for - filling with `` values. - """ - for col in df._data: - if isinstance( - df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype) - ): - # TODO we need to handle this - pass - elif df._data[col].has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance( - df._data[col], - ( - cudf.core.column.DatetimeColumn, - cudf.core.column.TimeDeltaColumn, - ), - ) - else str(cudf.NA) - ) - - df[col] = df._data[col].astype("str").fillna(fill_value) - else: - df[col] = df._data[col] - - return df - - def _get_renderable_dataframe(self): + def _get_renderable_dataframe(self) -> Self: """ Takes rows and columns from pandas settings or estimation from size. pulls quadrants based off of some known parameters then style for @@ -1971,9 +1937,9 @@ def _get_renderable_dataframe(self): for printing with the dataframe. 
""" max_rows = pd.options.display.max_rows - nrows = np.max([len(self) if max_rows is None else max_rows, 1]) - if pd.options.display.max_rows == 0: - nrows = len(self) + if max_rows in {0, None}: + max_rows = len(self) + nrows = max(max_rows, 1) ncols = ( pd.options.display.max_columns if pd.options.display.max_columns @@ -1981,7 +1947,7 @@ def _get_renderable_dataframe(self): ) if len(self) <= nrows and self._num_columns <= ncols: - output = self.copy(deep=False) + output = self elif self.empty and len(self.index) > 0: max_seq_items = pd.options.display.max_seq_items # In case of Empty DataFrame with index, Pandas prints @@ -2041,10 +2007,7 @@ def _get_renderable_dataframe(self): lower = cudf.concat([lower_left, lower_right], axis=1) output = cudf.concat([upper, lower]) - output = self._clean_nulls_from_dataframe(output) - output.index = output.index._clean_nulls_from_index() - - return output + return output._pandas_repr_compatible() @_performance_tracking def __repr__(self): @@ -6266,10 +6229,8 @@ def isin(self, values): # TODO: propagate nulls through isin # https://github.com/rapidsai/cudf/issues/7556 - fill_value = cudf.Scalar(False) - def make_false_column_like_self(): - return column.as_column(fill_value, length=len(self), dtype="bool") + return column.as_column(False, length=len(self), dtype="bool") # Preprocess different input types into a mapping from column names to # a list of values to check. diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8f45c6f0115..abf9f7b3686 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -820,6 +820,13 @@ def fillna( inplace=inplace, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + columns = (col._prep_pandas_compat_repr() for col in self._columns) + return self._from_data_like_self( + self._data._from_columns_like_self(columns, verify=False) + ) + @_performance_tracking def _drop_column( self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 6ae524d6346..17302311a7e 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -14,6 +14,7 @@ import cupy as cp import numpy as np import pandas as pd +import pyarrow as pa import pylibcudf as plc @@ -45,6 +46,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply +from cudf.utils.dtypes import cudf_dtype_to_pa_type from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -852,7 +854,9 @@ def _shift( plc.table.Table([col.to_pylibcudf(mode="read") for col in values]), [periods] * len(values), [ - cudf.Scalar(val, dtype=col.dtype).device_value.c_value + plc.interop.from_arrow( + pa.scalar(val, type=cudf_dtype_to_pa_type(col.dtype)) + ) for val, col in zip(fill_values, values) ], ) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 85be8d21d27..b535e8aabd2 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -339,7 +339,7 @@ def _values(self) -> ColumnBase: else: return column.column_empty(0, dtype=self.dtype) - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self def _is_numeric(self) -> bool: @@ -1127,15 +1127,9 @@ def _from_data(cls, data: MutableMapping, 
name: Any = no_default) -> Self: out.name = name return out - @classmethod @_performance_tracking - def _from_data_like_self( - cls, data: MutableMapping, name: Any = no_default - ) -> Self: - out = _index_from_data(data, name) - if name is not no_default: - out.name = name - return out + def _from_data_like_self(self, data: MutableMapping) -> Self: + return _index_from_data(data, self.name) @classmethod @_performance_tracking @@ -1494,7 +1488,7 @@ def __repr__(self) -> str: if isinstance(self._values, StringColumn): output = repr(self.to_pandas(nullable=True)) else: - output = repr(self._clean_nulls_from_index().to_pandas()) + output = repr(self._pandas_repr_compatible().to_pandas()) # We should remove all the single quotes # from the output due to the type-cast to # object dtype happening above. @@ -1650,20 +1644,6 @@ def __contains__(self, item) -> bool: hash(item) return item in self._column - def _clean_nulls_from_index(self) -> Index: - if self._values.has_nulls(): - fill_value = ( - str(cudf.NaT) - if isinstance(self, (DatetimeIndex, TimedeltaIndex)) - else str(cudf.NA) - ) - return Index._from_column( - self._column.astype("str").fillna(fill_value), - name=self.name, - ) - - return self - def any(self) -> bool: return self._column.any() @@ -2347,8 +2327,7 @@ def microsecond(self) -> Index: # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this # __mul__ returns an int16 column. 
- self._column.millisecond.astype("int32") - * cudf.Scalar(1000, dtype="int32") + self._column.millisecond.astype("int32") * np.int32(1000) ) + self._column.microsecond, name=self.name, @@ -3615,7 +3594,7 @@ def _is_interval(self) -> bool: def _is_boolean(self) -> bool: return False - def _clean_nulls_from_index(self) -> Self: + def _pandas_repr_compatible(self) -> Self: return self @property diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index e9ed74f804b..eded681baf0 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3255,7 +3255,7 @@ def duplicated( ) distinct = libcudf.column.Column.from_pylibcudf(plc_column) result = copying.scatter( - [cudf.Scalar(False, dtype=bool)], + [cudf.Scalar(False)], distinct, [as_column(True, length=len(self), dtype=bool)], bounds_check=False, @@ -4410,6 +4410,12 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True): index_names=self.index.names if keep_index else None, ) + def _pandas_repr_compatible(self) -> Self: + """Return Self but with columns prepared for a pandas-like repr.""" + result = super()._pandas_repr_compatible() + result.index = self.index._pandas_repr_compatible() + return result + def take(self, indices, axis=0): """Return a new frame containing the rows specified by *indices*. diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1e613e49ffc..e7efd01ca85 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -361,6 +361,13 @@ def _from_data( name=name, ) + @_performance_tracking + def _from_data_like_self(self, data: MutableMapping) -> Self: + mi = type(self)._from_data(data, name=self.name) + if mi.nlevels == self.nlevels: + mi.names = self.names + return mi + @classmethod def _simple_new( cls, @@ -1753,16 +1760,6 @@ def nunique(self, dropna: bool = True) -> int: mi = self.dropna(how="all") if dropna else self return len(mi.unique()) - def _clean_nulls_from_index(self) -> Self: - """ - Convert all na values(if any) in MultiIndex object - to `` as a preprocessing step to `__repr__` methods. - """ - index_df = self.to_frame(index=False, name=list(range(self.nlevels))) - return MultiIndex.from_frame( - index_df._clean_nulls_from_dataframe(index_df), names=self.names - ) - @_performance_tracking def memory_usage(self, deep: bool = False) -> int: usage = sum(col.memory_usage for col in self._columns) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 49c2c8cf387..805f9f9a9f9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1449,35 +1449,16 @@ def __repr__(self): warnings.simplefilter("ignore", FutureWarning) preprocess = cudf.concat([top, bottom]) else: - preprocess = self.copy() - preprocess.index = preprocess.index._clean_nulls_from_index() - if ( - preprocess.nullable - and not isinstance( - preprocess.dtype, - ( - cudf.CategoricalDtype, - cudf.ListDtype, - cudf.StructDtype, - cudf.core.dtypes.DecimalDtype, - ), - ) - ) or preprocess.dtype.kind == "m": - fill_value = ( - str(cudf.NaT) - if preprocess.dtype.kind in "mM" - else str(cudf.NA) - ) - output = repr( - preprocess.astype("str").fillna(fill_value).to_pandas() - ) - elif isinstance(preprocess.dtype, cudf.CategoricalDtype): + preprocess = self + if isinstance(preprocess.dtype, cudf.CategoricalDtype): min_rows = ( height if pd.get_option("display.min_rows") == 0 else pd.get_option("display.min_rows") ) 
show_dimensions = pd.get_option("display.show_dimensions") + preprocess = preprocess.copy(deep=False) + preprocess.index = preprocess.index._pandas_repr_compatible() if preprocess.dtype.categories.dtype.kind == "f": pd_series = ( preprocess.astype("str") @@ -1502,7 +1483,7 @@ def __repr__(self): na_rep=str(cudf.NA), ) else: - output = repr(preprocess.to_pandas()) + output = repr(preprocess._pandas_repr_compatible().to_pandas()) lines = output.split("\n") if isinstance(preprocess.dtype, cudf.CategoricalDtype): @@ -4125,8 +4106,8 @@ def microsecond(self) -> Series: # Need to manually promote column to int32 because # pandas-matching binop behaviour requires that this # __mul__ returns an int16 column. - extra = self.series._column.millisecond.astype("int32") * cudf.Scalar( - 1000, dtype="int32" + extra = self.series._column.millisecond.astype("int32") * np.int32( + 1000 ) return self._return_result_like_self(micro + extra) diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index fb8b9b3131c..58dabc85491 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
from __future__ import annotations @@ -42,9 +42,8 @@ def tokenize( """ if delimiter is None: delimiter = "" - delim = cudf.Scalar(delimiter, dtype="str") result = text._column.tokenize_with_vocabulary( - self.vocabulary, delim, default_id + self.vocabulary, delimiter, default_id ) return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py index 2f8a6d9e5e7..e2c332f34f5 100644 --- a/python/cudf/cudf/core/window/rolling.py +++ b/python/cudf/cudf/core/window/rolling.py @@ -1,10 +1,11 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION +# Copyright (c) 2020-2025, NVIDIA CORPORATION from __future__ import annotations import warnings from typing import TYPE_CHECKING import numba +import numpy as np import pandas as pd from pandas.api.indexers import BaseIndexer @@ -273,12 +274,8 @@ def _apply_agg_column(self, source_column, agg_name): end = as_column(end, dtype="int32") idx = as_column(range(len(start))) - preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype( - "int32" - ) - following_window = (end - idx - cudf.Scalar(1, "int32")).astype( - "int32" - ) + preceding_window = (idx - start + np.int32(1)).astype("int32") + following_window = (end - idx - np.int32(1)).astype("int32") window = None else: preceding_window = as_column(self.window) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index da0aa5be6f5..b1f81edfc54 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
import functools import operator @@ -14,6 +14,7 @@ from cudf.core.column.column import column_empty from cudf.testing import assert_eq from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES +from cudf.utils.dtypes import cudf_dtype_to_pa_type @pytest.mark.parametrize( @@ -423,7 +424,9 @@ def test_get_ind_sequence(): def test_contains_scalar(data, scalar, expect): sr = cudf.Series(data) expect = cudf.Series(expect) - got = sr.list.contains(cudf.Scalar(scalar, sr.dtype.element_type)) + got = sr.list.contains( + pa.scalar(scalar, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) assert_eq(expect, got) @@ -455,7 +458,9 @@ def test_contains_scalar(data, scalar, expect): def test_contains_null_search_key(data, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="bool") - got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type)) + got = sr.list.contains( + pa.scalar(None, type=cudf_dtype_to_pa_type(sr.dtype.element_type)) + ) assert_eq(expect, got) @@ -518,12 +523,12 @@ def test_contains_invalid(data, scalar): ), ( [["d", None, "e"], [None, "f"], []], - cudf.Scalar(cudf.NA, "O"), + pa.scalar(None, type=pa.string()), [None, None, None], ), ( [None, [10, 9, 8], [5, 8, None]], - cudf.Scalar(cudf.NA, "int64"), + pa.scalar(None, type=pa.int64()), [None, None, None], ), ], @@ -532,7 +537,11 @@ def test_index(data, search_key, expect): sr = cudf.Series(data) expect = cudf.Series(expect, dtype="int32") if is_scalar(search_key): - got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type)) + got = sr.list.index( + pa.scalar( + search_key, type=cudf_dtype_to_pa_type(sr.dtype.element_type) + ) + ) else: got = sr.list.index( cudf.Series(search_key, dtype=sr.dtype.element_type) diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py index bf0c97adb00..2cb742727cc 100644 --- a/python/cudf/cudf/tests/test_repr.py +++ b/python/cudf/cudf/tests/test_repr.py @@ -1,4 +1,4 @@ -# Copyright (c) 
2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import textwrap @@ -618,9 +618,9 @@ def test_timedelta_series_s_us_repr(data, dtype): cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"), textwrap.dedent( """ - 0 0 days 00:00:00.001000000 - 1 0 days 00:00:00.000200000 - 2 0 days 00:00:00.003000000 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:00.000200 + 2 0 days 00:00:00.003000 dtype: timedelta64[ns] """ ), @@ -710,12 +710,12 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.012 - 1 0 days 00:00:00.012 - 2 0 days 00:00:00.022 - 3 0 days 00:00:00.343 - 4 0 days 01:12:33.534 - 5 0 days 00:07:15.342 + 0 0 days 00:00:00.012000 + 1 0 days 00:00:00.012000 + 2 0 days 00:00:00.022000 + 3 0 days 00:00:00.343000 + 4 0 days 01:12:33.534000 + 5 0 days 00:07:15.342000 dtype: timedelta64[ms] """ ), @@ -745,13 +745,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 0 days 00:00:00.001 - 1 0 days 00:00:01.132 - 2 0 days 06:27:03.231 - 3 0 days 00:00:00.233 - 4 0 days 00:00:00 - 5 0 days 00:00:00.332 - 6 0 days 00:00:00.323 + 0 0 days 00:00:00.001000 + 1 0 days 00:00:01.132000 + 2 0 days 06:27:03.231000 + 3 0 days 00:00:00.233000 + 4 0 days 00:00:00 + 5 0 days 00:00:00.332000 + 6 0 days 00:00:00.323000 dtype: timedelta64[ms] """ ), @@ -771,13 +771,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 dtype: timedelta64[ms] """ ), @@ -824,13 +824,13 @@ def test_timedelta_series_s_us_repr(data, dtype): ), textwrap.dedent( """ - 0 157937 days 02:23:52.432 - 1 1 days 
13:25:36.784 - 2 2 days 20:09:05.345 - 3 2 days 14:03:52.411 - 4 11573 days 23:39:03.241 - 5 42 days 01:35:48.734 - 6 0 days 00:00:23.234 + 0 157937 days 02:23:52.432000 + 1 1 days 13:25:36.784000 + 2 2 days 20:09:05.345000 + 3 2 days 14:03:52.411000 + 4 11573 days 23:39:03.241000 + 5 42 days 01:35:48.734000 + 6 0 days 00:00:23.234000 Name: abc, dtype: timedelta64[ms] """ ), diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index 3cd18e24d30..c16df320ceb 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -123,6 +123,11 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[read_parquet-]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-0]": "Need to add include_file_path to IR", + "tests/unit/io/test_write.py::test_write_async[-2]": "Need to add include_file_path to IR", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed", "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints 
is handled incorrectly", @@ -190,6 +195,19 @@ def pytest_configure(config: pytest.Config) -> None: } +TESTS_TO_SKIP: Mapping[str, str] = { + # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks + # for obsolete timezone names. However, the chrono_tz package that + # polars uses doesn't read /usr/share/zoneinfo, instead packaging + # the current zoneinfo database from IANA. Consequently, when this + # hypothesis-generated test runs and generates timezones from the + # available zoneinfo-reported timezones, we can get an error from + # polars that the requested timezone is unknown. + # Since this is random, just skip it, rather than xfailing. + "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names", +} + + def pytest_collection_modifyitems( session: pytest.Session, config: pytest.Config, items: list[pytest.Item] ) -> None: @@ -198,5 +216,7 @@ def pytest_collection_modifyitems( # Don't xfail tests if running without fallback return for item in items: - if item.nodeid in EXPECTED_FAILURES: + if item.nodeid in TESTS_TO_SKIP: + item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid])) + elif item.nodeid in EXPECTED_FAILURES: item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index 0a08d46525b..584ac549ddd 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -53,6 +53,15 @@ def right(): ) +@pytest.mark.parametrize( + "maintain_order", ["left", "left_right", "right_left", "right"] +) +def test_join_maintain_order_param_unsupported(left, right, maintain_order): + q = left.join(right, on=pl.col("a"), how="inner", maintain_order=maintain_order) + + assert_ir_translation_raises(q, NotImplementedError) + + @pytest.mark.parametrize( "join_expr", [