From 4e33493e69de8a46cd71f657bcdd7b37b363e963 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:27:33 -0800 Subject: [PATCH 1/2] Use more pylibcudf.types instead of cudf._lib.types --- python/cudf/cudf/_lib/__init__.py | 7 - python/cudf/cudf/_lib/aggregation.pyx | 7 +- python/cudf/cudf/_lib/column.pyx | 10 +- python/cudf/cudf/_lib/orc.pyx | 8 +- python/cudf/cudf/_lib/reduce.pyx | 20 +- python/cudf/cudf/_lib/scalar.pyx | 69 ++---- python/cudf/cudf/_lib/types.pxd | 5 - python/cudf/cudf/_lib/types.pyx | 225 +++++--------------- python/cudf/cudf/core/column/categorical.py | 4 +- python/cudf/cudf/core/column/column.py | 4 +- python/cudf/cudf/core/dtypes.py | 4 +- python/cudf/cudf/utils/dtypes.py | 2 +- 12 files changed, 98 insertions(+), 267 deletions(-) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index cdf7cbe13c4..3175fb5a0b2 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -1,6 +1,4 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -import numpy as np - from . import ( copying, csv, @@ -19,8 +17,3 @@ strings_udf, text, ) - -MAX_COLUMN_SIZE = np.iinfo(np.int32).max -MAX_COLUMN_SIZE_STR = "INT32_MAX" -MAX_STRING_COLUMN_BYTES = np.iinfo(np.int32).max -MAX_STRING_COLUMN_BYTES_STR = "INT32_MAX" diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx index 3c96b90f0a1..a5519f8ca25 100644 --- a/python/cudf/cudf/_lib/aggregation.pyx +++ b/python/cudf/cudf/_lib/aggregation.pyx @@ -6,7 +6,7 @@ from numba.np import numpy_support import pylibcudf import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES +from cudf._lib.types import dtype_to_pylibcudf_type from cudf.utils import cudautils _agg_name_map = { @@ -198,13 +198,10 @@ class Aggregation: type_signature = (nb_type[:],) ptx_code, output_dtype = cudautils.compile_udf(op, type_signature) output_np_dtype = cudf.dtype(output_dtype) - if output_np_dtype not in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: - raise TypeError(f"Result of window function has unsupported dtype {op[1]}") - return cls( pylibcudf.aggregation.udf( ptx_code, - pylibcudf.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[output_np_dtype]), + dtype_to_pylibcudf_type(output_np_dtype) ) ) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 9cbe11d61ac..598250d04b9 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -31,12 +31,12 @@ from rmm.pylibrmm.device_buffer cimport DeviceBuffer from cudf._lib.types cimport ( dtype_from_column_view, - dtype_to_data_type, dtype_to_pylibcudf_type, ) from cudf._lib.types import dtype_from_pylibcudf_column +from pylibcudf cimport DataType as plc_DataType cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary @@ -362,7 +362,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[mutable_column_view] children cdef void* data @@ -399,7 +399,7 @@ cdef class Column: self._data = None return mutable_column_view( - dtype, + dtype.c_obj, self.size, data, mask, @@ -425,7 +425,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data @@ -451,7 +451,7 @@ cdef class Column: cdef libcudf_types.size_type c_null_count = null_count return column_view( - dtype, + dtype.c_obj, self.size, data, mask, diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index c829cac6409..9e34fa4d049 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -22,7 +22,7 @@ from cudf._lib.utils cimport data_from_pylibcudf_io import pylibcudf as plc import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES +from cudf._lib.types import dtype_to_pylibcudf_type from cudf._lib.utils import _index_level_name, generate_pandas_metadata from cudf.core.buffer import acquire_spill_lock from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata @@ -77,11 +77,7 @@ cpdef read_orc(object filepaths_or_buffers, get_skiprows_arg(skip_rows), get_num_rows_arg(num_rows), use_index, - plc.types.DataType( - SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[ - cudf.dtype(timestamp_type) - ] - ) + dtype_to_pylibcudf_type(cudf.dtype(timestamp_type)) ) names = tbl_w_meta.column_names(include_children=False) diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx index 944753d28b8..f049d23aea2 100644 --- a/python/cudf/cudf/_lib/reduce.pyx +++ b/python/cudf/cudf/_lib/reduce.pyx @@ -6,9 +6,9 @@ from cudf.core.buffer import acquire_spill_lock from cudf._lib.column cimport Column from cudf._lib.scalar cimport DeviceScalar -from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id +from cudf._lib.types cimport dtype_to_pylibcudf_type -import pylibcudf +import pylibcudf as plc from cudf._lib.aggregation import make_aggregation @@ -49,13 +49,17 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs): return cudf.utils.dtypes._get_nan_for_dtype(col_dtype) - result = pylibcudf.reduce.reduce( + result = plc.reduce.reduce( incol.to_pylibcudf(mode="read"), make_aggregation(reduction_op, kwargs).c_obj, dtype_to_pylibcudf_type(col_dtype), ) - if is_decimal_type_id(result.type().id()): + if result.type().id() in { + plc.types.TypeId.DECIMAL128, + plc.types.TypeId.DECIMAL32, + plc.types.TypeId.DECIMAL64 + }: scale = -result.type().scale() precision = _reduce_precision(col_dtype, reduction_op, len(incol)) return DeviceScalar.from_pylibcudf( @@ -84,11 +88,11 @@ def scan(scan_op, Column incol, inclusive, **kwargs): Flag for including nulls in relevant scan """ return Column.from_pylibcudf( - pylibcudf.reduce.scan( + plc.reduce.scan( incol.to_pylibcudf(mode="read"), make_aggregation(scan_op, kwargs).c_obj, - pylibcudf.reduce.ScanType.INCLUSIVE if inclusive - else pylibcudf.reduce.ScanType.EXCLUSIVE, + plc.reduce.ScanType.INCLUSIVE if inclusive + else plc.reduce.ScanType.EXCLUSIVE, ) ) @@ -107,7 +111,7 @@ def minmax(Column incol): ------- A pair of ``(min, max)`` values of ``incol`` """ - min, max = pylibcudf.reduce.minmax(incol.to_pylibcudf(mode="read")) + min, max = plc.reduce.minmax(incol.to_pylibcudf(mode="read")) return ( cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(min)), cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(max)), diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 56712402919..caa99248a2e 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -10,23 +10,19 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -import pylibcudf +import pylibcudf as plc import cudf -from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf.core.dtypes import ListDtype, StructDtype +from cudf._lib.types import dtype_from_pylibcudf_column from cudf.core.missing import NA, NaT -cimport pylibcudf.libcudf.types as libcudf_types # We currently need this cimport because some of the implementations here # access the c_obj of the scalar, and because we need to be able to call # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). from pylibcudf cimport Scalar as plc_Scalar -from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar - -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id +from pylibcudf.libcudf.scalar.scalar cimport scalar def _replace_nested(obj, check, replacement): @@ -62,12 +58,12 @@ def gather_metadata(dtypes): """ out = [] for name, dtype in dtypes.items(): - v = pylibcudf.interop.ColumnMetadata(name) + v = plc.interop.ColumnMetadata(name) if isinstance(dtype, cudf.StructDtype): v.children_meta = gather_metadata(dtype.fields) elif isinstance(dtype, cudf.ListDtype): # Offsets column is unnamed and has no children - v.children_meta.append(pylibcudf.interop.ColumnMetadata("")) + v.children_meta.append(plc.interop.ColumnMetadata("")) v.children_meta.extend( gather_metadata({"": dtype.element_type}) ) @@ -81,7 +77,7 @@ cdef class DeviceScalar: # that from_unique_ptr is implemented is probably dereferencing this in an # invalid state. See what the best way to fix that is. def __cinit__(self, *args, **kwargs): - self.c_value = pylibcudf.Scalar.__new__(pylibcudf.Scalar) + self.c_value = plc.Scalar.__new__(plc.Scalar) def __init__(self, value, dtype): """ @@ -127,20 +123,20 @@ cdef class DeviceScalar: pa_array = pa.array([pa.scalar(value, type=pa_type)]) pa_table = pa.Table.from_arrays([pa_array], names=[""]) - table = pylibcudf.interop.from_arrow(pa_table) + table = plc.interop.from_arrow(pa_table) column = table.columns()[0] if isinstance(dtype, cudf.core.dtypes.DecimalDtype): if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): - column = pylibcudf.unary.cast( - column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL32, -dtype.scale) + column = plc.unary.cast( + column, plc.DataType(plc.TypeId.DECIMAL32, -dtype.scale) ) elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): - column = pylibcudf.unary.cast( - column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL64, -dtype.scale) + column = plc.unary.cast( + column, plc.DataType(plc.TypeId.DECIMAL64, -dtype.scale) ) - self.c_value = pylibcudf.copying.get_element(column, 0) + self.c_value = plc.copying.get_element(column, 0) self._dtype = dtype def _to_host_scalar(self): @@ -150,7 +146,7 @@ cdef class DeviceScalar: null_type = NaT if is_datetime or is_timedelta else NA metadata = gather_metadata({"": self.dtype})[0] - ps = pylibcudf.interop.to_arrow(self.c_value, metadata) + ps = plc.interop.to_arrow(self.c_value, metadata) if not ps.is_valid: return null_type @@ -225,44 +221,19 @@ cdef class DeviceScalar: return s cdef void _set_dtype(self, dtype=None): - cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type() - if dtype is not None: self._dtype = dtype - elif cdtype.id() in { - libcudf_types.type_id.DECIMAL32, - libcudf_types.type_id.DECIMAL64, - libcudf_types.type_id.DECIMAL128, + + plc_scalar = self.c_value + if plc_scalar.type().id() in { + plc.TypeId.DECIMAL32, + plc.TypeId.DECIMAL64, + plc.TypeId.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - elif cdtype.id() == libcudf_types.type_id.STRUCT: - struct_table_view = (<struct_scalar*>self.get_raw_ptr())[0].view() - self._dtype = StructDtype({ - str(i): dtype_from_column_view(struct_table_view.column(i)) - for i in range(struct_table_view.num_columns()) - }) - elif cdtype.id() == libcudf_types.type_id.LIST: - if ( - <list_scalar*>self.get_raw_ptr() - )[0].view().type().id() == libcudf_types.type_id.LIST: - self._dtype = dtype_from_column_view( - (<list_scalar*>self.get_raw_ptr())[0].view() - ) - else: - self._dtype = ListDtype( - LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - <underlying_type_t_type_id>( - (<list_scalar*>self.get_raw_ptr())[0] - .view().type().id() - ) - ] - ) - else: - self._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - <underlying_type_t_type_id>(cdtype.id()) - ] + self._dtype = dtype_from_pylibcudf_column(plc.Column.from_scalar(plc_scalar, 1)) def as_device_scalar(val, dtype=None): diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index c2b760490c1..18b1d26e4db 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -1,16 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. from libc.stdint cimport int32_t -from libcpp cimport bool -cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view ctypedef int32_t underlying_type_t_type_id cdef dtype_from_column_view(column_view cv) -cdef libcudf_types.data_type dtype_to_data_type(dtype) except * cpdef dtype_to_pylibcudf_type(dtype) -cdef bool is_decimal_type_id(libcudf_types.type_id tid) except * diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index f169ea12b10..777bd070b32 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from enum import IntEnum - import numpy as np import pandas as pd @@ -11,138 +9,46 @@ cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -import pylibcudf +import pylibcudf as plc import cudf -class TypeId(IntEnum): - EMPTY = <underlying_type_t_type_id> libcudf_types.type_id.EMPTY - INT8 = <underlying_type_t_type_id> libcudf_types.type_id.INT8 - INT16 = <underlying_type_t_type_id> libcudf_types.type_id.INT16 - INT32 = <underlying_type_t_type_id> libcudf_types.type_id.INT32 - INT64 = <underlying_type_t_type_id> libcudf_types.type_id.INT64 - UINT8 = <underlying_type_t_type_id> libcudf_types.type_id.UINT8 - UINT16 = <underlying_type_t_type_id> libcudf_types.type_id.UINT16 - UINT32 = <underlying_type_t_type_id> libcudf_types.type_id.UINT32 - UINT64 = <underlying_type_t_type_id> libcudf_types.type_id.UINT64 - FLOAT32 = <underlying_type_t_type_id> libcudf_types.type_id.FLOAT32 - FLOAT64 = <underlying_type_t_type_id> libcudf_types.type_id.FLOAT64 - BOOL8 = <underlying_type_t_type_id> libcudf_types.type_id.BOOL8 - TIMESTAMP_DAYS = ( - <underlying_type_t_type_id> libcudf_types.type_id.TIMESTAMP_DAYS - ) - TIMESTAMP_SECONDS = ( - <underlying_type_t_type_id> libcudf_types.type_id.TIMESTAMP_SECONDS - ) - TIMESTAMP_MILLISECONDS = ( - <underlying_type_t_type_id> ( - libcudf_types.type_id.TIMESTAMP_MILLISECONDS - ) - ) - TIMESTAMP_MICROSECONDS = ( - <underlying_type_t_type_id> ( - libcudf_types.type_id.TIMESTAMP_MICROSECONDS - ) - ) - TIMESTAMP_NANOSECONDS = ( - <underlying_type_t_type_id> libcudf_types.type_id.TIMESTAMP_NANOSECONDS - ) - DURATION_SECONDS = ( - <underlying_type_t_type_id> libcudf_types.type_id.DURATION_SECONDS - ) - DURATION_MILLISECONDS = ( - <underlying_type_t_type_id> libcudf_types.type_id.DURATION_MILLISECONDS - ) - DURATION_MICROSECONDS = ( - <underlying_type_t_type_id> libcudf_types.type_id.DURATION_MICROSECONDS - ) - DURATION_NANOSECONDS = ( - <underlying_type_t_type_id> libcudf_types.type_id.DURATION_NANOSECONDS - ) - STRING = <underlying_type_t_type_id> libcudf_types.type_id.STRING - DECIMAL32 = <underlying_type_t_type_id> libcudf_types.type_id.DECIMAL32 - DECIMAL64 = <underlying_type_t_type_id> libcudf_types.type_id.DECIMAL64 - DECIMAL128 = <underlying_type_t_type_id> libcudf_types.type_id.DECIMAL128 - STRUCT = <underlying_type_t_type_id> libcudf_types.type_id.STRUCT - - -SUPPORTED_NUMPY_TO_LIBCUDF_TYPES = { - np.dtype("int8"): TypeId.INT8, - np.dtype("int16"): TypeId.INT16, - np.dtype("int32"): TypeId.INT32, - np.dtype("int64"): TypeId.INT64, - np.dtype("uint8"): TypeId.UINT8, - np.dtype("uint16"): TypeId.UINT16, - np.dtype("uint32"): TypeId.UINT32, - np.dtype("uint64"): TypeId.UINT64, - np.dtype("float32"): TypeId.FLOAT32, - np.dtype("float64"): TypeId.FLOAT64, - np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): TypeId.STRING, - np.dtype("bool"): TypeId.BOOL8, - np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS, -} - SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { - k: pylibcudf.TypeId(v).value - for k, v in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES.items() + np.dtype("int8"): plc.types.TypeId.INT8, + np.dtype("int16"): plc.types.TypeId.INT16, + np.dtype("int32"): plc.types.TypeId.INT32, + np.dtype("int64"): plc.types.TypeId.INT64, + np.dtype("uint8"): plc.types.TypeId.UINT8, + np.dtype("uint16"): plc.types.TypeId.UINT16, + np.dtype("uint32"): plc.types.TypeId.UINT32, + np.dtype("uint64"): plc.types.TypeId.UINT64, + np.dtype("float32"): plc.types.TypeId.FLOAT32, + np.dtype("float64"): plc.types.TypeId.FLOAT64, + np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, + np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, + np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, + np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, + np.dtype("object"): plc.types.TypeId.STRING, + np.dtype("bool"): plc.types.TypeId.BOOL8, + np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, + np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, } - -LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - # There's no equivalent to EMPTY in cudf. We translate EMPTY - # columns from libcudf to ``int8`` columns of all nulls in Python. - # ``int8`` is chosen because it uses the least amount of memory. - TypeId.EMPTY: np.dtype("int8"), - TypeId.INT8: np.dtype("int8"), - TypeId.INT16: np.dtype("int16"), - TypeId.INT32: np.dtype("int32"), - TypeId.INT64: np.dtype("int64"), - TypeId.UINT8: np.dtype("uint8"), - TypeId.UINT16: np.dtype("uint16"), - TypeId.UINT32: np.dtype("uint32"), - TypeId.UINT64: np.dtype("uint64"), - TypeId.FLOAT32: np.dtype("float32"), - TypeId.FLOAT64: np.dtype("float64"), - TypeId.BOOL8: np.dtype("bool"), - TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"), - TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"), - TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"), - TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"), - TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"), - TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"), - TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"), - TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"), - TypeId.STRING: np.dtype("object"), - TypeId.STRUCT: np.dtype("object"), -} - PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - pylibcudf.TypeId(k).value: v - for k, v in LIBCUDF_TO_SUPPORTED_NUMPY_TYPES.items() + plc_type: np_type + for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() } +# There's no equivalent to EMPTY in cudf. We translate EMPTY +# columns from libcudf to ``int8`` columns of all nulls in Python. +# ``int8`` is chosen because it uses the least amount of memory. +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") -duration_unit_map = { - TypeId.DURATION_SECONDS: "s", - TypeId.DURATION_MILLISECONDS: "ms", - TypeId.DURATION_MICROSECONDS: "us", - TypeId.DURATION_NANOSECONDS: "ns" -} - -datetime_unit_map = { - TypeId.TIMESTAMP_SECONDS: "s", - TypeId.TIMESTAMP_MILLISECONDS: "ms", - TypeId.TIMESTAMP_MICROSECONDS: "us", - TypeId.TIMESTAMP_NANOSECONDS: "ns", -} -size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID] +size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] cdef dtype_from_lists_column_view(column_view cv): @@ -190,71 +96,40 @@ cdef dtype_from_column_view(column_view cv): scale=-cv.type().scale() ) else: - return LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ <underlying_type_t_type_id>(tid) ] -cdef libcudf_types.data_type dtype_to_data_type(dtype) except *: - # Note: This function is to be phased out in favor of - # dtype_to_pylibcudf_type which will return a pylibcudf - # DataType object - cdef libcudf_types.type_id tid - if isinstance(dtype, cudf.ListDtype): - tid = libcudf_types.type_id.LIST - elif isinstance(dtype, cudf.StructDtype): - tid = libcudf_types.type_id.STRUCT - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = libcudf_types.type_id.DECIMAL128 - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = libcudf_types.type_id.DECIMAL64 - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = libcudf_types.type_id.DECIMAL32 - else: - tid = <libcudf_types.type_id> ( - <underlying_type_t_type_id> ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[np.dtype(dtype)])) - - if is_decimal_type_id(tid): - return libcudf_types.data_type(tid, -dtype.scale) - else: - return libcudf_types.data_type(tid) cpdef dtype_to_pylibcudf_type(dtype): if isinstance(dtype, cudf.ListDtype): - return pylibcudf.DataType(pylibcudf.TypeId.LIST) + return plc.DataType(plc.TypeId.LIST) elif isinstance(dtype, cudf.StructDtype): - return pylibcudf.DataType(pylibcudf.TypeId.STRUCT) + return plc.DataType(plc.TypeId.STRUCT) elif isinstance(dtype, cudf.Decimal128Dtype): - tid = pylibcudf.TypeId.DECIMAL128 - return pylibcudf.DataType(tid, -dtype.scale) + tid = plc.TypeId.DECIMAL128 + return plc.DataType(tid, -dtype.scale) elif isinstance(dtype, cudf.Decimal64Dtype): - tid = pylibcudf.TypeId.DECIMAL64 - return pylibcudf.DataType(tid, -dtype.scale) + tid = plc.TypeId.DECIMAL64 + return plc.DataType(tid, -dtype.scale) elif isinstance(dtype, cudf.Decimal32Dtype): - tid = pylibcudf.TypeId.DECIMAL32 - return pylibcudf.DataType(tid, -dtype.scale) - # libcudf types don't support localization so convert to the base type + tid = plc.TypeId.DECIMAL32 + return plc.DataType(tid, -dtype.scale) + # libcudf types don't support timezones so convert to the base type elif isinstance(dtype, pd.DatetimeTZDtype): dtype = np.dtype(f"<M8[{dtype.unit}]") else: dtype = np.dtype(dtype) - return pylibcudf.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) - -cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *: - return tid in ( - libcudf_types.type_id.DECIMAL128, - libcudf_types.type_id.DECIMAL64, - libcudf_types.type_id.DECIMAL32, - ) + return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) def dtype_from_pylibcudf_lists_column(col): child = col.list_view().child() tid = child.type().id() - if tid == pylibcudf.TypeId.LIST: + if tid == plc.TypeId.LIST: return cudf.ListDtype(dtype_from_pylibcudf_lists_column(child)) - elif tid == pylibcudf.TypeId.EMPTY: + elif tid == plc.TypeId.EMPTY: return cudf.ListDtype("int8") else: return cudf.ListDtype( @@ -274,26 +149,24 @@ def dtype_from_pylibcudf_column(col): type_ = col.type() tid = type_.id() - if tid == pylibcudf.TypeId.LIST: + if tid == plc.TypeId.LIST: return dtype_from_pylibcudf_lists_column(col) - elif tid == pylibcudf.TypeId.STRUCT: + elif tid == plc.TypeId.STRUCT: return dtype_from_pylibcudf_structs_column(col) - elif tid == pylibcudf.TypeId.DECIMAL64: + elif tid == plc.TypeId.DECIMAL64: return cudf.Decimal64Dtype( precision=cudf.Decimal64Dtype.MAX_PRECISION, scale=-type_.scale() ) - elif tid == pylibcudf.TypeId.DECIMAL32: + elif tid == plc.TypeId.DECIMAL32: return cudf.Decimal32Dtype( precision=cudf.Decimal32Dtype.MAX_PRECISION, scale=-type_.scale() ) - elif tid == pylibcudf.TypeId.DECIMAL128: + elif tid == plc.TypeId.DECIMAL128: return cudf.Decimal128Dtype( precision=cudf.Decimal128Dtype.MAX_PRECISION, scale=-type_.scale() ) else: - return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - <underlying_type_t_type_id>(tid) - ] + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c849a9d3d2b..f6702165e29 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1189,10 +1189,10 @@ def _concat( codes = [o.codes for o in objs] newsize = sum(map(len, codes)) - if newsize > libcudf.MAX_COLUMN_SIZE: + if newsize > np.iinfo(libcudf.types.size_type_dtype).max: raise MemoryError( f"Result of concat cannot have " - f"size > {libcudf.MAX_COLUMN_SIZE_STR}" + f"size > {libcudf.types.size_type_dtype}_MAX" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype, masked=True) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 1ddc79e8970..6be5382db79 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2362,10 +2362,10 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: ) newsize = sum(map(len, objs)) - if newsize > libcudf.MAX_COLUMN_SIZE: + if newsize > np.iinfo(libcudf.types.size_type_dtype).max: raise MemoryError( f"Result of concat cannot have " - f"size > {libcudf.MAX_COLUMN_SIZE_STR}" + f"size > {libcudf.types.size_type_dtype}_MAX" ) elif newsize == 0: return column_empty(0, head.dtype, masked=True) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 801020664da..1151b429280 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -57,7 +57,9 @@ def dtype(arbitrary): else: if np_dtype.kind in set("OU"): return np.dtype("object") - elif np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES: + elif ( + np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES + ): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 57bf08e6eec..ca8f9cac2d0 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -151,7 +151,7 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif cudf.api.types.is_decimal128_dtype(dtype): return cudf.core.dtypes.Decimal128Dtype - elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES: + elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: return dtype.type return infer_dtype_from_object(dtype) From 23f79919e06adcfc3d71f3af816fb84a05f52f0c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 17 Dec 2024 19:45:35 -0800 Subject: [PATCH 2/2] Revert some old pieces of DeviceScalar._set_dtype --- python/cudf/cudf/_lib/scalar.pyx | 46 +++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 1921460fff6..40bd50acf16 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -13,7 +13,9 @@ from libcpp.utility cimport move import pylibcudf as plc import cudf -from cudf._lib.types import dtype_from_pylibcudf_column +from cudf.core.dtypes import ListDtype, StructDtype +from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES +from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf.core.missing import NA, NaT # We currently need this cimport because some of the implementations here @@ -21,8 +23,8 @@ from cudf.core.missing import NA, NaT # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). -from pylibcudf cimport Scalar as plc_Scalar -from pylibcudf.libcudf.scalar.scalar cimport scalar +from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID +from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar def _replace_nested(obj, check, replacement): @@ -221,19 +223,43 @@ cdef class DeviceScalar: return s cdef void _set_dtype(self, dtype=None): + cdef plc_TypeID cdtype_id = self.c_value.type().id() if dtype is not None: self._dtype = dtype - - plc_scalar = self.c_value - if plc_scalar.type().id() in { - plc.TypeId.DECIMAL32, - plc.TypeId.DECIMAL64, - plc.TypeId.DECIMAL128, + elif cdtype_id in { + plc_TypeID.DECIMAL32, + plc_TypeID.DECIMAL64, + plc_TypeID.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - self._dtype = dtype_from_pylibcudf_column(plc.Column.from_scalar(plc_scalar, 1)) + elif cdtype_id == plc_TypeID.STRUCT: + struct_table_view = (<struct_scalar*>self.get_raw_ptr())[0].view() + self._dtype = StructDtype({ + str(i): dtype_from_column_view(struct_table_view.column(i)) + for i in range(struct_table_view.num_columns()) + }) + elif cdtype_id == plc_TypeID.LIST: + if ( + <list_scalar*>self.get_raw_ptr() + )[0].view().type().id() == plc_TypeID.LIST: + self._dtype = dtype_from_column_view( + (<list_scalar*>self.get_raw_ptr())[0].view() + ) + else: + self._dtype = ListDtype( + PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ + <underlying_type_t_type_id>( + (<list_scalar*>self.get_raw_ptr())[0] + .view().type().id() + ) + ] + ) + else: + self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ + <underlying_type_t_type_id>(cdtype_id) + ] def as_device_scalar(val, dtype=None):