From 4e33493e69de8a46cd71f657bcdd7b37b363e963 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 4 Dec 2024 17:27:33 -0800
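Subject: [PATCH 1/2] Use more pylibcudf.types instead of cudf._lib.types

Replace the hand-maintained TypeId IntEnum and the numpy <-> libcudf
lookup tables in cudf/_lib/types.pyx with mappings keyed directly on
pylibcudf.types.TypeId, and switch callers over to
dtype_to_pylibcudf_type.

- Remove dtype_to_data_type and is_decimal_type_id from
  cudf/_lib/types.pxd and types.pyx; the Column code that builds
  column_view / mutable_column_view now takes the libcudf data_type
  from the pylibcudf DataType's c_obj.
- Remove SUPPORTED_NUMPY_TO_LIBCUDF_TYPES,
  LIBCUDF_TO_SUPPORTED_NUMPY_TYPES, duration_unit_map and
  datetime_unit_map.
- Remove MAX_COLUMN_SIZE* and MAX_STRING_COLUMN_BYTES* from
  cudf/_lib/__init__.py; the concat size checks now use
  np.iinfo(libcudf.types.size_type_dtype).max.
- Drop the explicit unsupported-dtype TypeError in the window UDF
  aggregation; dtype_to_pylibcudf_type performs the lookup instead.
- Simplify DeviceScalar._set_dtype to infer the dtype via
  dtype_from_pylibcudf_column.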
---
 python/cudf/cudf/_lib/__init__.py           |   7 -
 python/cudf/cudf/_lib/aggregation.pyx       |   7 +-
 python/cudf/cudf/_lib/column.pyx            |  10 +-
 python/cudf/cudf/_lib/orc.pyx               |   8 +-
 python/cudf/cudf/_lib/reduce.pyx            |  20 +-
 python/cudf/cudf/_lib/scalar.pyx            |  69 ++----
 python/cudf/cudf/_lib/types.pxd             |   5 -
 python/cudf/cudf/_lib/types.pyx             | 225 +++++---------------
 python/cudf/cudf/core/column/categorical.py |   4 +-
 python/cudf/cudf/core/column/column.py      |   4 +-
 python/cudf/cudf/core/dtypes.py             |   4 +-
 python/cudf/cudf/utils/dtypes.py            |   2 +-
 12 files changed, 98 insertions(+), 267 deletions(-)

diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index cdf7cbe13c4..3175fb5a0b2 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -1,6 +1,4 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
-import numpy as np
-
 from . import (
     copying,
     csv,
@@ -19,8 +17,3 @@
     strings_udf,
     text,
 )
-
-MAX_COLUMN_SIZE = np.iinfo(np.int32).max
-MAX_COLUMN_SIZE_STR = "INT32_MAX"
-MAX_STRING_COLUMN_BYTES = np.iinfo(np.int32).max
-MAX_STRING_COLUMN_BYTES_STR = "INT32_MAX"
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 3c96b90f0a1..a5519f8ca25 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -6,7 +6,7 @@ from numba.np import numpy_support
 import pylibcudf
 
 import cudf
-from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES
+from cudf._lib.types import dtype_to_pylibcudf_type
 from cudf.utils import cudautils
 
 _agg_name_map = {
@@ -198,13 +198,10 @@ class Aggregation:
         type_signature = (nb_type[:],)
         ptx_code, output_dtype = cudautils.compile_udf(op, type_signature)
         output_np_dtype = cudf.dtype(output_dtype)
-        if output_np_dtype not in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES:
-            raise TypeError(f"Result of window function has unsupported dtype {op[1]}")
-
         return cls(
             pylibcudf.aggregation.udf(
                 ptx_code,
-                pylibcudf.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[output_np_dtype]),
+                dtype_to_pylibcudf_type(output_np_dtype)
             )
         )
 
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 9cbe11d61ac..598250d04b9 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -31,12 +31,12 @@ from rmm.pylibrmm.device_buffer cimport DeviceBuffer
 
 from cudf._lib.types cimport (
     dtype_from_column_view,
-    dtype_to_data_type,
     dtype_to_pylibcudf_type,
 )
 
 from cudf._lib.types import dtype_from_pylibcudf_column
 
+from pylibcudf cimport DataType as plc_DataType
 cimport pylibcudf.libcudf.copying as cpp_copying
 cimport pylibcudf.libcudf.types as libcudf_types
 cimport pylibcudf.libcudf.unary as libcudf_unary
@@ -362,7 +362,7 @@ cdef class Column:
             col = self
             data_dtype = col.dtype
 
-        cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
+        cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype)
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[mutable_column_view] children
         cdef void* data
@@ -399,7 +399,7 @@ cdef class Column:
         self._data = None
 
         return mutable_column_view(
-            dtype,
+            dtype.c_obj,
             self.size,
             data,
             mask,
@@ -425,7 +425,7 @@ cdef class Column:
             col = self
             data_dtype = col.dtype
 
-        cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
+        cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype)
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[column_view] children
         cdef void* data
@@ -451,7 +451,7 @@ cdef class Column:
         cdef libcudf_types.size_type c_null_count = null_count
 
         return column_view(
-            dtype,
+            dtype.c_obj,
             self.size,
             data,
             mask,
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index c829cac6409..9e34fa4d049 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -22,7 +22,7 @@ from cudf._lib.utils cimport data_from_pylibcudf_io
 import pylibcudf as plc
 
 import cudf
-from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES
+from cudf._lib.types import dtype_to_pylibcudf_type
 from cudf._lib.utils import _index_level_name, generate_pandas_metadata
 from cudf.core.buffer import acquire_spill_lock
 from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata
@@ -77,11 +77,7 @@ cpdef read_orc(object filepaths_or_buffers,
         get_skiprows_arg(skip_rows),
         get_num_rows_arg(num_rows),
         use_index,
-        plc.types.DataType(
-            SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[
-                cudf.dtype(timestamp_type)
-            ]
-        )
+        dtype_to_pylibcudf_type(cudf.dtype(timestamp_type))
     )
 
     names = tbl_w_meta.column_names(include_children=False)
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 944753d28b8..f049d23aea2 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -6,9 +6,9 @@ from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id
+from cudf._lib.types cimport dtype_to_pylibcudf_type
 
-import pylibcudf
+import pylibcudf as plc
 
 from cudf._lib.aggregation import make_aggregation
 
@@ -49,13 +49,17 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
 
         return cudf.utils.dtypes._get_nan_for_dtype(col_dtype)
 
-    result = pylibcudf.reduce.reduce(
+    result = plc.reduce.reduce(
         incol.to_pylibcudf(mode="read"),
         make_aggregation(reduction_op, kwargs).c_obj,
         dtype_to_pylibcudf_type(col_dtype),
     )
 
-    if is_decimal_type_id(result.type().id()):
+    if result.type().id() in {
+        plc.types.TypeId.DECIMAL128,
+        plc.types.TypeId.DECIMAL32,
+        plc.types.TypeId.DECIMAL64
+    }:
         scale = -result.type().scale()
         precision = _reduce_precision(col_dtype, reduction_op, len(incol))
         return DeviceScalar.from_pylibcudf(
@@ -84,11 +88,11 @@ def scan(scan_op, Column incol, inclusive, **kwargs):
         Flag for including nulls in relevant scan
     """
     return Column.from_pylibcudf(
-        pylibcudf.reduce.scan(
+        plc.reduce.scan(
             incol.to_pylibcudf(mode="read"),
             make_aggregation(scan_op, kwargs).c_obj,
-            pylibcudf.reduce.ScanType.INCLUSIVE if inclusive
-            else pylibcudf.reduce.ScanType.EXCLUSIVE,
+            plc.reduce.ScanType.INCLUSIVE if inclusive
+            else plc.reduce.ScanType.EXCLUSIVE,
         )
     )
 
@@ -107,7 +111,7 @@ def minmax(Column incol):
     -------
     A pair of ``(min, max)`` values of ``incol``
     """
-    min, max = pylibcudf.reduce.minmax(incol.to_pylibcudf(mode="read"))
+    min, max = plc.reduce.minmax(incol.to_pylibcudf(mode="read"))
     return (
         cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(min)),
         cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(max)),
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 56712402919..caa99248a2e 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -10,23 +10,19 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-import pylibcudf
+import pylibcudf as plc
 
 import cudf
-from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES
-from cudf.core.dtypes import ListDtype, StructDtype
+from cudf._lib.types import dtype_from_pylibcudf_column
 from cudf.core.missing import NA, NaT
 
-cimport pylibcudf.libcudf.types as libcudf_types
 # We currently need this cimport because some of the implementations here
 # access the c_obj of the scalar, and because we need to be able to call
 # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until
 # DeviceScalar is phased out entirely from cuDF Cython (at which point
 # cudf.Scalar will be directly backed by pylibcudf.Scalar).
 from pylibcudf cimport Scalar as plc_Scalar
-from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar
-
-from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
+from pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 def _replace_nested(obj, check, replacement):
@@ -62,12 +58,12 @@ def gather_metadata(dtypes):
     """
     out = []
     for name, dtype in dtypes.items():
-        v = pylibcudf.interop.ColumnMetadata(name)
+        v = plc.interop.ColumnMetadata(name)
         if isinstance(dtype, cudf.StructDtype):
             v.children_meta = gather_metadata(dtype.fields)
         elif isinstance(dtype, cudf.ListDtype):
             # Offsets column is unnamed and has no children
-            v.children_meta.append(pylibcudf.interop.ColumnMetadata(""))
+            v.children_meta.append(plc.interop.ColumnMetadata(""))
             v.children_meta.extend(
                 gather_metadata({"": dtype.element_type})
             )
@@ -81,7 +77,7 @@ cdef class DeviceScalar:
     # that from_unique_ptr is implemented is probably dereferencing this in an
     # invalid state. See what the best way to fix that is.
     def __cinit__(self, *args, **kwargs):
-        self.c_value = pylibcudf.Scalar.__new__(pylibcudf.Scalar)
+        self.c_value = plc.Scalar.__new__(plc.Scalar)
 
     def __init__(self, value, dtype):
         """
@@ -127,20 +123,20 @@ cdef class DeviceScalar:
             pa_array = pa.array([pa.scalar(value, type=pa_type)])
 
         pa_table = pa.Table.from_arrays([pa_array], names=[""])
-        table = pylibcudf.interop.from_arrow(pa_table)
+        table = plc.interop.from_arrow(pa_table)
 
         column = table.columns()[0]
         if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
             if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
-                column = pylibcudf.unary.cast(
-                    column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL32, -dtype.scale)
+                column = plc.unary.cast(
+                    column, plc.DataType(plc.TypeId.DECIMAL32, -dtype.scale)
                 )
             elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
-                column = pylibcudf.unary.cast(
-                    column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL64, -dtype.scale)
+                column = plc.unary.cast(
+                    column, plc.DataType(plc.TypeId.DECIMAL64, -dtype.scale)
                 )
 
-        self.c_value = pylibcudf.copying.get_element(column, 0)
+        self.c_value = plc.copying.get_element(column, 0)
         self._dtype = dtype
 
     def _to_host_scalar(self):
@@ -150,7 +146,7 @@ cdef class DeviceScalar:
         null_type = NaT if is_datetime or is_timedelta else NA
 
         metadata = gather_metadata({"": self.dtype})[0]
-        ps = pylibcudf.interop.to_arrow(self.c_value, metadata)
+        ps = plc.interop.to_arrow(self.c_value, metadata)
         if not ps.is_valid:
             return null_type
 
@@ -225,44 +221,19 @@ cdef class DeviceScalar:
         return s
 
     cdef void _set_dtype(self, dtype=None):
-        cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type()
-
         if dtype is not None:
             self._dtype = dtype
-        elif cdtype.id() in {
-            libcudf_types.type_id.DECIMAL32,
-            libcudf_types.type_id.DECIMAL64,
-            libcudf_types.type_id.DECIMAL128,
+
+        plc_scalar = self.c_value
+        if plc_scalar.type().id() in {
+            plc.TypeId.DECIMAL32,
+            plc.TypeId.DECIMAL64,
+            plc.TypeId.DECIMAL128,
         }:
             raise TypeError(
                 "Must pass a dtype when constructing from a fixed-point scalar"
             )
-        elif cdtype.id() == libcudf_types.type_id.STRUCT:
-            struct_table_view = (<struct_scalar*>self.get_raw_ptr())[0].view()
-            self._dtype = StructDtype({
-                str(i): dtype_from_column_view(struct_table_view.column(i))
-                for i in range(struct_table_view.num_columns())
-            })
-        elif cdtype.id() == libcudf_types.type_id.LIST:
-            if (
-                <list_scalar*>self.get_raw_ptr()
-            )[0].view().type().id() == libcudf_types.type_id.LIST:
-                self._dtype = dtype_from_column_view(
-                    (<list_scalar*>self.get_raw_ptr())[0].view()
-                )
-            else:
-                self._dtype = ListDtype(
-                    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
-                        <underlying_type_t_type_id>(
-                            (<list_scalar*>self.get_raw_ptr())[0]
-                            .view().type().id()
-                        )
-                    ]
-                )
-        else:
-            self._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
-                <underlying_type_t_type_id>(cdtype.id())
-            ]
+        self._dtype = dtype_from_pylibcudf_column(plc.Column.from_scalar(plc_scalar, 1))
 
 
 def as_device_scalar(val, dtype=None):
diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd
index c2b760490c1..18b1d26e4db 100644
--- a/python/cudf/cudf/_lib/types.pxd
+++ b/python/cudf/cudf/_lib/types.pxd
@@ -1,16 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
-from libcpp cimport bool
 
-cimport pylibcudf.libcudf.types as libcudf_types
 from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
 ctypedef int32_t underlying_type_t_type_id
 
 cdef dtype_from_column_view(column_view cv)
 
-cdef libcudf_types.data_type dtype_to_data_type(dtype) except *
 cpdef dtype_to_pylibcudf_type(dtype)
-cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index f169ea12b10..777bd070b32 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -1,7 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from enum import IntEnum
-
 import numpy as np
 import pandas as pd
 
@@ -11,138 +9,46 @@ cimport pylibcudf.libcudf.types as libcudf_types
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
-import pylibcudf
+import pylibcudf as plc
 
 import cudf
 
 
-class TypeId(IntEnum):
-    EMPTY = <underlying_type_t_type_id> libcudf_types.type_id.EMPTY
-    INT8 = <underlying_type_t_type_id> libcudf_types.type_id.INT8
-    INT16 = <underlying_type_t_type_id> libcudf_types.type_id.INT16
-    INT32 = <underlying_type_t_type_id> libcudf_types.type_id.INT32
-    INT64 = <underlying_type_t_type_id> libcudf_types.type_id.INT64
-    UINT8 = <underlying_type_t_type_id> libcudf_types.type_id.UINT8
-    UINT16 = <underlying_type_t_type_id> libcudf_types.type_id.UINT16
-    UINT32 = <underlying_type_t_type_id> libcudf_types.type_id.UINT32
-    UINT64 = <underlying_type_t_type_id> libcudf_types.type_id.UINT64
-    FLOAT32 = <underlying_type_t_type_id> libcudf_types.type_id.FLOAT32
-    FLOAT64 = <underlying_type_t_type_id> libcudf_types.type_id.FLOAT64
-    BOOL8 = <underlying_type_t_type_id> libcudf_types.type_id.BOOL8
-    TIMESTAMP_DAYS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.TIMESTAMP_DAYS
-    )
-    TIMESTAMP_SECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.TIMESTAMP_SECONDS
-    )
-    TIMESTAMP_MILLISECONDS = (
-        <underlying_type_t_type_id> (
-            libcudf_types.type_id.TIMESTAMP_MILLISECONDS
-        )
-    )
-    TIMESTAMP_MICROSECONDS = (
-        <underlying_type_t_type_id> (
-            libcudf_types.type_id.TIMESTAMP_MICROSECONDS
-        )
-    )
-    TIMESTAMP_NANOSECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.TIMESTAMP_NANOSECONDS
-    )
-    DURATION_SECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.DURATION_SECONDS
-    )
-    DURATION_MILLISECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.DURATION_MILLISECONDS
-    )
-    DURATION_MICROSECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.DURATION_MICROSECONDS
-    )
-    DURATION_NANOSECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.DURATION_NANOSECONDS
-    )
-    STRING = <underlying_type_t_type_id> libcudf_types.type_id.STRING
-    DECIMAL32 = <underlying_type_t_type_id> libcudf_types.type_id.DECIMAL32
-    DECIMAL64 = <underlying_type_t_type_id> libcudf_types.type_id.DECIMAL64
-    DECIMAL128 = <underlying_type_t_type_id> libcudf_types.type_id.DECIMAL128
-    STRUCT = <underlying_type_t_type_id> libcudf_types.type_id.STRUCT
-
-
-SUPPORTED_NUMPY_TO_LIBCUDF_TYPES = {
-    np.dtype("int8"): TypeId.INT8,
-    np.dtype("int16"): TypeId.INT16,
-    np.dtype("int32"): TypeId.INT32,
-    np.dtype("int64"): TypeId.INT64,
-    np.dtype("uint8"): TypeId.UINT8,
-    np.dtype("uint16"): TypeId.UINT16,
-    np.dtype("uint32"): TypeId.UINT32,
-    np.dtype("uint64"): TypeId.UINT64,
-    np.dtype("float32"): TypeId.FLOAT32,
-    np.dtype("float64"): TypeId.FLOAT64,
-    np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS,
-    np.dtype("datetime64[ms]"): TypeId.TIMESTAMP_MILLISECONDS,
-    np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS,
-    np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS,
-    np.dtype("object"): TypeId.STRING,
-    np.dtype("bool"): TypeId.BOOL8,
-    np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS,
-    np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS,
-    np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS,
-    np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS,
-}
-
 SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = {
-    k: pylibcudf.TypeId(v).value
-    for k, v in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES.items()
+    np.dtype("int8"): plc.types.TypeId.INT8,
+    np.dtype("int16"): plc.types.TypeId.INT16,
+    np.dtype("int32"): plc.types.TypeId.INT32,
+    np.dtype("int64"): plc.types.TypeId.INT64,
+    np.dtype("uint8"): plc.types.TypeId.UINT8,
+    np.dtype("uint16"): plc.types.TypeId.UINT16,
+    np.dtype("uint32"): plc.types.TypeId.UINT32,
+    np.dtype("uint64"): plc.types.TypeId.UINT64,
+    np.dtype("float32"): plc.types.TypeId.FLOAT32,
+    np.dtype("float64"): plc.types.TypeId.FLOAT64,
+    np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS,
+    np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS,
+    np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS,
+    np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS,
+    np.dtype("object"): plc.types.TypeId.STRING,
+    np.dtype("bool"): plc.types.TypeId.BOOL8,
+    np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS,
+    np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS,
+    np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS,
+    np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS,
 }
-
-LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = {
-    # There's no equivalent to EMPTY in cudf.  We translate EMPTY
-    # columns from libcudf to ``int8`` columns of all nulls in Python.
-    # ``int8`` is chosen because it uses the least amount of memory.
-    TypeId.EMPTY: np.dtype("int8"),
-    TypeId.INT8: np.dtype("int8"),
-    TypeId.INT16: np.dtype("int16"),
-    TypeId.INT32: np.dtype("int32"),
-    TypeId.INT64: np.dtype("int64"),
-    TypeId.UINT8: np.dtype("uint8"),
-    TypeId.UINT16: np.dtype("uint16"),
-    TypeId.UINT32: np.dtype("uint32"),
-    TypeId.UINT64: np.dtype("uint64"),
-    TypeId.FLOAT32: np.dtype("float32"),
-    TypeId.FLOAT64: np.dtype("float64"),
-    TypeId.BOOL8: np.dtype("bool"),
-    TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"),
-    TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"),
-    TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"),
-    TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"),
-    TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"),
-    TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"),
-    TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"),
-    TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"),
-    TypeId.STRING: np.dtype("object"),
-    TypeId.STRUCT: np.dtype("object"),
-}
-
 PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = {
-    pylibcudf.TypeId(k).value: v
-    for k, v in LIBCUDF_TO_SUPPORTED_NUMPY_TYPES.items()
+    plc_type: np_type
+    for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items()
 }
+# There's no equivalent to EMPTY in cudf.  We translate EMPTY
+# columns from libcudf to ``int8`` columns of all nulls in Python.
+# ``int8`` is chosen because it uses the least amount of memory.
+PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8")
+PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object")
+PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object")
 
-duration_unit_map = {
-    TypeId.DURATION_SECONDS: "s",
-    TypeId.DURATION_MILLISECONDS: "ms",
-    TypeId.DURATION_MICROSECONDS: "us",
-    TypeId.DURATION_NANOSECONDS: "ns"
-}
-
-datetime_unit_map = {
-    TypeId.TIMESTAMP_SECONDS: "s",
-    TypeId.TIMESTAMP_MILLISECONDS: "ms",
-    TypeId.TIMESTAMP_MICROSECONDS: "us",
-    TypeId.TIMESTAMP_NANOSECONDS: "ns",
-}
 
-size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID]
+size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID]
 
 
 cdef dtype_from_lists_column_view(column_view cv):
@@ -190,71 +96,40 @@ cdef dtype_from_column_view(column_view cv):
             scale=-cv.type().scale()
         )
     else:
-        return LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
+        return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
             <underlying_type_t_type_id>(tid)
         ]
 
-cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
-    # Note: This function is to be phased out in favor of
-    # dtype_to_pylibcudf_type which will return a pylibcudf
-    # DataType object
-    cdef libcudf_types.type_id tid
-    if isinstance(dtype, cudf.ListDtype):
-        tid = libcudf_types.type_id.LIST
-    elif isinstance(dtype, cudf.StructDtype):
-        tid = libcudf_types.type_id.STRUCT
-    elif isinstance(dtype, cudf.Decimal128Dtype):
-        tid = libcudf_types.type_id.DECIMAL128
-    elif isinstance(dtype, cudf.Decimal64Dtype):
-        tid = libcudf_types.type_id.DECIMAL64
-    elif isinstance(dtype, cudf.Decimal32Dtype):
-        tid = libcudf_types.type_id.DECIMAL32
-    else:
-        tid = <libcudf_types.type_id> (
-            <underlying_type_t_type_id> (
-                SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[np.dtype(dtype)]))
-
-    if is_decimal_type_id(tid):
-        return libcudf_types.data_type(tid, -dtype.scale)
-    else:
-        return libcudf_types.data_type(tid)
 
 cpdef dtype_to_pylibcudf_type(dtype):
     if isinstance(dtype, cudf.ListDtype):
-        return pylibcudf.DataType(pylibcudf.TypeId.LIST)
+        return plc.DataType(plc.TypeId.LIST)
     elif isinstance(dtype, cudf.StructDtype):
-        return pylibcudf.DataType(pylibcudf.TypeId.STRUCT)
+        return plc.DataType(plc.TypeId.STRUCT)
     elif isinstance(dtype, cudf.Decimal128Dtype):
-        tid = pylibcudf.TypeId.DECIMAL128
-        return pylibcudf.DataType(tid, -dtype.scale)
+        tid = plc.TypeId.DECIMAL128
+        return plc.DataType(tid, -dtype.scale)
     elif isinstance(dtype, cudf.Decimal64Dtype):
-        tid = pylibcudf.TypeId.DECIMAL64
-        return pylibcudf.DataType(tid, -dtype.scale)
+        tid = plc.TypeId.DECIMAL64
+        return plc.DataType(tid, -dtype.scale)
     elif isinstance(dtype, cudf.Decimal32Dtype):
-        tid = pylibcudf.TypeId.DECIMAL32
-        return pylibcudf.DataType(tid, -dtype.scale)
-    # libcudf types don't support localization so convert to the base type
+        tid = plc.TypeId.DECIMAL32
+        return plc.DataType(tid, -dtype.scale)
+    # libcudf types don't support timezones so convert to the base type
     elif isinstance(dtype, pd.DatetimeTZDtype):
         dtype = np.dtype(f"<M8[{dtype.unit}]")
     else:
         dtype = np.dtype(dtype)
-    return pylibcudf.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype])
-
-cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *:
-    return tid in (
-        libcudf_types.type_id.DECIMAL128,
-        libcudf_types.type_id.DECIMAL64,
-        libcudf_types.type_id.DECIMAL32,
-    )
+    return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype])
 
 
 def dtype_from_pylibcudf_lists_column(col):
     child = col.list_view().child()
     tid = child.type().id()
 
-    if tid == pylibcudf.TypeId.LIST:
+    if tid == plc.TypeId.LIST:
         return cudf.ListDtype(dtype_from_pylibcudf_lists_column(child))
-    elif tid == pylibcudf.TypeId.EMPTY:
+    elif tid == plc.TypeId.EMPTY:
         return cudf.ListDtype("int8")
     else:
         return cudf.ListDtype(
@@ -274,26 +149,24 @@ def dtype_from_pylibcudf_column(col):
     type_ = col.type()
     tid = type_.id()
 
-    if tid == pylibcudf.TypeId.LIST:
+    if tid == plc.TypeId.LIST:
         return dtype_from_pylibcudf_lists_column(col)
-    elif tid == pylibcudf.TypeId.STRUCT:
+    elif tid == plc.TypeId.STRUCT:
         return dtype_from_pylibcudf_structs_column(col)
-    elif tid == pylibcudf.TypeId.DECIMAL64:
+    elif tid == plc.TypeId.DECIMAL64:
         return cudf.Decimal64Dtype(
             precision=cudf.Decimal64Dtype.MAX_PRECISION,
             scale=-type_.scale()
         )
-    elif tid == pylibcudf.TypeId.DECIMAL32:
+    elif tid == plc.TypeId.DECIMAL32:
         return cudf.Decimal32Dtype(
             precision=cudf.Decimal32Dtype.MAX_PRECISION,
             scale=-type_.scale()
         )
-    elif tid == pylibcudf.TypeId.DECIMAL128:
+    elif tid == plc.TypeId.DECIMAL128:
         return cudf.Decimal128Dtype(
             precision=cudf.Decimal128Dtype.MAX_PRECISION,
             scale=-type_.scale()
         )
     else:
-        return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
-            <underlying_type_t_type_id>(tid)
-        ]
+        return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid]
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index c849a9d3d2b..f6702165e29 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1189,10 +1189,10 @@ def _concat(
         codes = [o.codes for o in objs]
 
         newsize = sum(map(len, codes))
-        if newsize > libcudf.MAX_COLUMN_SIZE:
+        if newsize > np.iinfo(libcudf.types.size_type_dtype).max:
             raise MemoryError(
                 f"Result of concat cannot have "
-                f"size > {libcudf.MAX_COLUMN_SIZE_STR}"
+                f"size > {libcudf.types.size_type_dtype}_MAX"
             )
         elif newsize == 0:
             codes_col = column.column_empty(0, head.codes.dtype, masked=True)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 1ddc79e8970..6be5382db79 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2362,10 +2362,10 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
         )
 
     newsize = sum(map(len, objs))
-    if newsize > libcudf.MAX_COLUMN_SIZE:
+    if newsize > np.iinfo(libcudf.types.size_type_dtype).max:
         raise MemoryError(
             f"Result of concat cannot have "
-            f"size > {libcudf.MAX_COLUMN_SIZE_STR}"
+            f"size > {libcudf.types.size_type_dtype}_MAX"
         )
     elif newsize == 0:
         return column_empty(0, head.dtype, masked=True)
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 801020664da..1151b429280 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -57,7 +57,9 @@ def dtype(arbitrary):
     else:
         if np_dtype.kind in set("OU"):
             return np.dtype("object")
-        elif np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
+        elif (
+            np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES
+        ):
             raise TypeError(f"Unsupported type {np_dtype}")
         return np_dtype
 
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 57bf08e6eec..ca8f9cac2d0 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -151,7 +151,7 @@ def cudf_dtype_from_pydata_dtype(dtype):
         return cudf.core.dtypes.Decimal64Dtype
     elif cudf.api.types.is_decimal128_dtype(dtype):
         return cudf.core.dtypes.Decimal128Dtype
-    elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
+    elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES:
         return dtype.type
 
     return infer_dtype_from_object(dtype)

From 23f79919e06adcfc3d71f3af816fb84a05f52f0c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 17 Dec 2024 19:45:35 -0800
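Subject: [PATCH 2/2] Revert some old pieces of DeviceScalar._set_dtype

Restore the if/elif chain and the struct_scalar / list_scalar based
dtype inference in DeviceScalar._set_dtype.

The simplified version from the previous patch ran the fixed-point
check and the dtype_from_pylibcudf_column inference even when a dtype
was passed explicitly, so it raised TypeError for decimal scalars and
overwrote the caller-supplied dtype. The type-id lookups now go
through PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES instead of the removed
LIBCUDF_TO_SUPPORTED_NUMPY_TYPES.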
---
 python/cudf/cudf/_lib/scalar.pyx | 46 +++++++++++++++++++++++++-------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 1921460fff6..40bd50acf16 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -13,7 +13,9 @@ from libcpp.utility cimport move
 import pylibcudf as plc
 
 import cudf
-from cudf._lib.types import dtype_from_pylibcudf_column
+from cudf.core.dtypes import ListDtype, StructDtype
+from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES
+from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
 from cudf.core.missing import NA, NaT
 
 # We currently need this cimport because some of the implementations here
@@ -21,8 +23,8 @@ from cudf.core.missing import NA, NaT
 # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until
 # DeviceScalar is phased out entirely from cuDF Cython (at which point
 # cudf.Scalar will be directly backed by pylibcudf.Scalar).
-from pylibcudf cimport Scalar as plc_Scalar
-from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID
+from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar
 
 
 def _replace_nested(obj, check, replacement):
@@ -221,19 +223,43 @@ cdef class DeviceScalar:
         return s
 
     cdef void _set_dtype(self, dtype=None):
+        cdef plc_TypeID cdtype_id = self.c_value.type().id()
         if dtype is not None:
             self._dtype = dtype
-
-        plc_scalar = self.c_value
-        if plc_scalar.type().id() in {
-            plc.TypeId.DECIMAL32,
-            plc.TypeId.DECIMAL64,
-            plc.TypeId.DECIMAL128,
+        elif cdtype_id in {
+            plc_TypeID.DECIMAL32,
+            plc_TypeID.DECIMAL64,
+            plc_TypeID.DECIMAL128,
         }:
             raise TypeError(
                 "Must pass a dtype when constructing from a fixed-point scalar"
             )
-        self._dtype = dtype_from_pylibcudf_column(plc.Column.from_scalar(plc_scalar, 1))
+        elif cdtype_id == plc_TypeID.STRUCT:
+            struct_table_view = (<struct_scalar*>self.get_raw_ptr())[0].view()
+            self._dtype = StructDtype({
+                str(i): dtype_from_column_view(struct_table_view.column(i))
+                for i in range(struct_table_view.num_columns())
+            })
+        elif cdtype_id == plc_TypeID.LIST:
+            if (
+                <list_scalar*>self.get_raw_ptr()
+            )[0].view().type().id() == plc_TypeID.LIST:
+                self._dtype = dtype_from_column_view(
+                    (<list_scalar*>self.get_raw_ptr())[0].view()
+                )
+            else:
+                self._dtype = ListDtype(
+                    PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
+                        <underlying_type_t_type_id>(
+                            (<list_scalar*>self.get_raw_ptr())[0]
+                            .view().type().id()
+                        )
+                    ]
+                )
+        else:
+            self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
+                <underlying_type_t_type_id>(cdtype_id)
+            ]
 
 
 def as_device_scalar(val, dtype=None):