diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 5e2f46714d9..e0b315f34fc 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -8,10 +8,9 @@ notebooks/         @rapidsai/cudf-python-codeowners
 python/dask_cudf/  @rapidsai/cudf-dask-codeowners
 
 #cmake code owners
-cpp/CMakeLists.txt               @rapidsai/cudf-cmake-codeowners
-cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners
-**/cmake/                        @rapidsai/cudf-cmake-codeowners
-*.cmake                          @rapidsai/cudf-cmake-codeowners
+CMakeLists.txt @rapidsai/cudf-cmake-codeowners
+**/cmake/      @rapidsai/cudf-cmake-codeowners
+*.cmake        @rapidsai/cudf-cmake-codeowners
 
 #java code owners
 java/              @rapidsai/cudf-java-codeowners
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index fb7182f4133..65aebfb7f8c 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -62,7 +62,7 @@ jobs:
       arch: "amd64"
       branch: ${{ inputs.branch }}
       build_type: ${{ inputs.build_type || 'branch' }}
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       date: ${{ inputs.date }}
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 7c0bd6d52e2..e955b8f1f80 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -13,6 +13,7 @@ jobs:
   # Please keep pr-builder as the top job here
   pr-builder:
     needs:
+      - check-nightly-ci
       - changed-files
       - checks
       - conda-cpp-build
@@ -52,7 +53,20 @@ jobs:
       OTEL_SERVICE_NAME: 'pr-cudf'
     steps:
       - name: Telemetry setup
+        if: ${{ vars.TELEMETRY_ENABLED == 'true' }}
         uses: rapidsai/shared-actions/telemetry-dispatch-stash-base-env-vars@main
+  check-nightly-ci:
+    # Switch to ubuntu-latest once it defaults to a version of Ubuntu that
+    # provides at least Python 3.11 (see
+    # https://docs.python.org/3/library/datetime.html#datetime.date.fromisoformat)
+    runs-on: ubuntu-24.04
+    env:
+      RAPIDS_GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+      - name: Check if nightly CI is passing
+        uses: rapidsai/shared-actions/check_nightly_success/dispatch@main
+        with:
+          repo: cudf
   changed-files:
     secrets: inherit
     needs: telemetry-setup
@@ -172,7 +186,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_java.sh"
   static-configure:
     needs: checks
@@ -193,7 +207,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_notebooks.sh"
   docs-build:
     needs: conda-python-build
@@ -203,7 +217,7 @@ jobs:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
   wheel-build-libcudf:
     needs: checks
@@ -327,16 +341,11 @@ jobs:
         run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh"
 
   telemetry-summarize:
-    runs-on: ubuntu-latest
+    # This job must use a self-hosted runner to record telemetry traces.
+    runs-on: linux-amd64-cpu4
     needs: pr-builder
-    if: always()
+    if: ${{ vars.TELEMETRY_ENABLED == 'true' && !cancelled() }}
     continue-on-error: true
     steps:
-      - name: Load stashed telemetry env vars
-        uses: rapidsai/shared-actions/telemetry-dispatch-load-base-env-vars@main
-        with:
-            load_service_name: true
       - name: Telemetry summarize
-        uses: rapidsai/shared-actions/telemetry-dispatch-write-summary@main
-        with:
-          cert_concat: "${{ secrets.OTEL_EXPORTER_OTLP_CA_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_CERTIFICATE }};${{ secrets.OTEL_EXPORTER_OTLP_CLIENT_KEY }}"
+        uses: rapidsai/shared-actions/telemetry-dispatch-summarize@main
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 858352f515d..dc82c17022a 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -41,7 +41,7 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
     secrets: inherit
@@ -94,7 +94,7 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
@@ -106,7 +106,7 @@ jobs:
       sha: ${{ inputs.sha }}
       node_type: "gpu-v100-latest-1"
       arch: "amd64"
-      container_image: "rapidsai/ci-conda:cuda12.5.1-ubuntu22.04-py3.11"
+      container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
diff --git a/.github/workflows/trigger-breaking-change-alert.yaml b/.github/workflows/trigger-breaking-change-alert.yaml
index 3b972f31ca4..01dd2436beb 100644
--- a/.github/workflows/trigger-breaking-change-alert.yaml
+++ b/.github/workflows/trigger-breaking-change-alert.yaml
@@ -12,7 +12,7 @@ jobs:
   trigger-notifier:
     if: contains(github.event.pull_request.labels.*.name, 'breaking')
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-24.12
+    uses: rapidsai/shared-workflows/.github/workflows/breaking-change-alert.yaml@branch-25.02
     with:
       sender_login: ${{ github.event.sender.login }}
       sender_avatar: ${{ github.event.sender.avatar_url }}
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 4290d013fe4..52d8f659611 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -35,6 +35,10 @@ rapids-mamba-retry install \
 
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
+EXITCODE=0
+trap "EXITCODE=1" ERR
+set +e
+
 rapids-logger "Build CPP docs"
 pushd cpp/doxygen
 aws s3 cp s3://rapidsai-docs/librmm/html/${RAPIDS_VERSION_MAJOR_MINOR}/rmm.tag . || echo "Failed to download rmm Doxygen tag"
@@ -58,3 +62,5 @@ mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html"
 popd
 
 RAPIDS_VERSION_NUMBER="${RAPIDS_VERSION_MAJOR_MINOR}" rapids-upload-docs
+
+exit ${EXITCODE}
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index af49942c8cd..d80e4fef0d0 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -1,11 +1,13 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 package_name="libcudf"
 package_dir="python/libcudf"
 
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+
 rapids-logger "Generating build requirements"
 
 rapids-dependency-file-generator \
@@ -28,8 +30,6 @@ export PIP_NO_BUILD_ISOLATION=0
 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON"
 ./ci/build_wheel.sh "${package_name}" "${package_dir}"
 
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-
 mkdir -p ${package_dir}/final_dist
 python -m auditwheel repair \
     --exclude libnvcomp.so.4 \
diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh
index f8ddbaba0f3..30e3ffc9a43 100755
--- a/ci/cudf_pandas_scripts/third-party-integration/test.sh
+++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh
@@ -26,6 +26,8 @@ main() {
     LIBS=${LIBS#[}
     LIBS=${LIBS%]}
 
+    ANY_FAILURES=0
+
     for lib in ${LIBS//,/ }; do
         lib=$(echo "$lib" | tr -d '""')
         echo "Running tests for library $lib"
@@ -56,10 +58,6 @@ main() {
         rapids-logger "Check GPU usage"
         nvidia-smi
 
-        EXITCODE=0
-        trap "EXITCODE=1" ERR
-        set +e
-
         rapids-logger "pytest ${lib}"
 
         NUM_PROCESSES=8
@@ -72,12 +70,20 @@ main() {
             fi
         done
 
+        EXITCODE=0
+        trap "EXITCODE=1" ERR
+        set +e
+
         TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh ${lib}
 
+        set -e
         rapids-logger "Test script exiting with value: ${EXITCODE}"
+        if [[ ${EXITCODE} != 0 ]]; then
+            ANY_FAILURES=1
+        fi
     done
 
-    exit ${EXITCODE}
+    exit ${ANY_FAILURES}
 }
 
 main "$@"
diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh
index db86721755d..3c6dba72164 100755
--- a/ci/test_python_other.sh
+++ b/ci/test_python_other.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 
 # Support invoking test_python_cudf.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../
@@ -24,8 +24,8 @@ EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
 
-rapids-logger "pytest dask_cudf (dask-expr)"
-DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
+rapids-logger "pytest dask_cudf"
+./ci/run_dask_cudf_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
   --dist=worksteal \
@@ -34,13 +34,6 @@ DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \
   --cov-report=term
 
-rapids-logger "pytest dask_cudf (legacy)"
-DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \
-  --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
-  --numprocesses=8 \
-  --dist=worksteal \
-  .
-
 rapids-logger "pytest cudf_kafka"
 ./ci/run_cudf_kafka_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml"
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index e15949f4bdb..44f430ce98d 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 set -eou pipefail
 
@@ -30,21 +30,11 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
 mkdir -p "${RAPIDS_TESTS_DIR}"
 
 # Run tests in dask_cudf/tests and dask_cudf/io/tests
-rapids-logger "pytest dask_cudf (dask-expr)"
+rapids-logger "pytest dask_cudf"
 pushd python/dask_cudf/dask_cudf
-DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \
+python -m pytest \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \
   --numprocesses=8 \
   --dist=worksteal \
   .
 popd
-
-# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy)
-rapids-logger "pytest dask_cudf (legacy)"
-pushd python/dask_cudf/dask_cudf
-DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \
-  --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \
-  --numprocesses=8 \
-  --dist=worksteal \
-  .
-popd
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 87c40421be0..a8e5018b283 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -19,7 +19,7 @@ dependencies:
 - cramjam
 - cubinlinker
 - cuda-nvtx=11.8
-- cuda-python>=11.7.1,<12.0a0
+- cuda-python>=11.8.5,<12.0a0
 - cuda-sanitizer-api=11.8.86
 - cuda-version=11.8
 - cudatoolkit
@@ -55,7 +55,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.0.13,<0.0.18
+- numba-cuda>=0.2.0,<0.3.0
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
@@ -66,12 +66,12 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.15
+- polars>=1.11,<1.18
 - pre-commit
 - ptxcompiler
 - pyarrow>=14.0.0,<19.0.0a0
 - pydata-sphinx-theme!=0.14.2
-- pynvml>=11.4.1,<12.0.0a0
+- pynvml>=12.0.0,<13.0.0a0
 - pytest-benchmark
 - pytest-cases>=3.8.2
 - pytest-cov
@@ -87,7 +87,6 @@ dependencies:
 - s3fs>=2022.3.0
 - scikit-build-core>=0.10.0
 - scipy
-- spdlog>=1.14.1,<1.15
 - sphinx
 - sphinx-autobuild
 - sphinx-copybutton
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 0935de96d19..6dc99b14f5d 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -21,7 +21,7 @@ dependencies:
 - cuda-nvcc
 - cuda-nvrtc-dev
 - cuda-nvtx-dev
-- cuda-python>=12.0,<13.0a0
+- cuda-python>=12.6.2,<13.0a0
 - cuda-sanitizer-api
 - cuda-version=12.5
 - cupy>=12.0.0
@@ -54,7 +54,7 @@ dependencies:
 - nbsphinx
 - ninja
 - notebook
-- numba-cuda>=0.0.13,<0.0.18
+- numba-cuda>=0.2.0,<0.3.0
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcomp==4.1.0.6
@@ -64,12 +64,12 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.4dev0
 - pandoc
-- polars>=1.11,<1.15
+- polars>=1.11,<1.18
 - pre-commit
 - pyarrow>=14.0.0,<19.0.0a0
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink>=0.0.0a0
-- pynvml>=11.4.1,<12.0.0a0
+- pynvml>=12.0.0,<13.0.0a0
 - pytest-benchmark
 - pytest-cases>=3.8.2
 - pytest-cov
@@ -86,7 +86,6 @@ dependencies:
 - s3fs>=2022.3.0
 - scikit-build-core>=0.10.0
 - scipy
-- spdlog>=1.14.1,<1.15
 - sphinx
 - sphinx-autobuild
 - sphinx-copybutton
diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml
index b6c03dc1bc2..7a0005497df 100644
--- a/conda/recipes/cudf-polars/meta.yaml
+++ b/conda/recipes/cudf-polars/meta.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - pylibcudf ={{ version }}
-    - polars >=1.11,<1.15
+    - polars >=1.11,<1.18
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index e52b8c5f2a0..b34496cc256 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
@@ -80,7 +80,7 @@ requirements:
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.4dev0
     - cupy >=12.0.0
-    - numba-cuda >=0.0.13,<0.0.18
+    - numba-cuda >=0.2.0,<0.3.0
     - numpy >=1.23,<3.0a0
     - pyarrow>=14.0.0,<18.0.0a0
     - libcudf ={{ version }}
@@ -91,7 +91,7 @@ requirements:
     - cudatoolkit
     - ptxcompiler >=0.7.0
     - cubinlinker  # CUDA enhanced compatibility.
-    - cuda-python >=11.7.1,<12.0a0
+    - cuda-python >=11.8.5,<12.0a0
     {% else %}
     - cuda-cudart
     - libcufile  # [linux64]
@@ -100,7 +100,7 @@ requirements:
     # TODO: Add nvjitlink here
     # xref: https://github.com/rapidsai/cudf/issues/12822
     - cuda-nvrtc
-    - cuda-python >=12.0,<13.0a0
+    - cuda-python >=12.6.2,<13.0a0
     - pynvjitlink
     {% endif %}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml
index 74ecded8ead..a476d5d53df 100644
--- a/conda/recipes/dask-cudf/meta.yaml
+++ b/conda/recipes/dask-cudf/meta.yaml
@@ -43,7 +43,7 @@ requirements:
   run:
     - python
     - cudf ={{ version }}
-    - pynvml >=11.4.1,<12.0.0a0
+    - pynvml >=12.0.0,<13.0.0a0
     - rapids-dask-dependency ={{ minor_version }}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index c78ca326005..00020fdf6b8 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -31,9 +31,6 @@ fmt_version:
 flatbuffers_version:
   - "=24.3.25"
 
-spdlog_version:
-  - ">=1.14.1,<1.15"
-
 nvcomp_version:
   - "=4.1.0.6"
 
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 1c2e9e8dd98..b585aafc397 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -68,7 +68,6 @@ requirements:
     - librdkafka {{ librdkafka_version }}
     - fmt {{ fmt_version }}
     - flatbuffers {{ flatbuffers_version }}
-    - spdlog {{ spdlog_version }}
     - zlib {{ zlib_version }}
 
 outputs:
diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml
index 3d965f30986..08eab363af0 100644
--- a/conda/recipes/pylibcudf/meta.yaml
+++ b/conda/recipes/pylibcudf/meta.yaml
@@ -83,9 +83,9 @@ requirements:
     - {{ pin_compatible('rmm', max_pin='x.x') }}
     - fsspec >=0.6.0
     {% if cuda_major == "11" %}
-    - cuda-python >=11.7.1,<12.0a0
+    - cuda-python >=11.8.5,<12.0a0
     {% else %}
-    - cuda-python >=12.0,<13.0a0
+    - cuda-python >=12.6.2,<13.0a0
     {% endif %}
     - nvtx >=0.2.1
     - packaging
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 12e6826f301..9dabe4e8800 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -273,6 +273,11 @@ endif()
 
 # add third party dependencies using CPM
 rapids_cpm_init()
+
+include(${rapids-cmake-dir}/cpm/rapids_logger.cmake)
+rapids_cpm_rapids_logger()
+rapids_make_logger(cudf EXPORT_SET cudf-exports LOGGER_DEFAULT_LEVEL WARN)
+
 # find jitify
 include(cmake/thirdparty/get_jitify.cmake)
 # find NVTX
@@ -299,8 +304,6 @@ include(cmake/Modules/JitifyPreprocessKernels.cmake)
 include(cmake/thirdparty/get_kvikio.cmake)
 # find fmt
 include(cmake/thirdparty/get_fmt.cmake)
-# find spdlog
-include(cmake/thirdparty/get_spdlog.cmake)
 # find nanoarrow
 include(cmake/thirdparty/get_nanoarrow.cmake)
 # find thread_pool
@@ -440,7 +443,6 @@ add_library(
   src/groupby/sort/group_quantiles.cu
   src/groupby/sort/group_std.cu
   src/groupby/sort/group_sum.cu
-  src/groupby/sort/scan.cpp
   src/groupby/sort/group_count_scan.cu
   src/groupby/sort/group_max_scan.cu
   src/groupby/sort/group_min_scan.cu
@@ -448,6 +450,8 @@ add_library(
   src/groupby/sort/group_rank_scan.cu
   src/groupby/sort/group_replace_nulls.cu
   src/groupby/sort/group_sum_scan.cu
+  src/groupby/sort/host_udf_aggregation.cpp
+  src/groupby/sort/scan.cpp
   src/groupby/sort/sort_helper.cu
   src/hash/md5_hash.cu
   src/hash/murmurhash3_x86_32.cu
@@ -457,6 +461,7 @@ add_library(
   src/hash/sha256_hash.cu
   src/hash/sha384_hash.cu
   src/hash/sha512_hash.cu
+  src/hash/xxhash_32.cu
   src/hash/xxhash_64.cu
   src/interop/dlpack.cpp
   src/interop/arrow_utilities.cpp
@@ -772,7 +777,6 @@ add_library(
   src/utilities/default_stream.cpp
   src/utilities/host_memory.cpp
   src/utilities/linked_column.cpp
-  src/utilities/logger.cpp
   src/utilities/prefetch.cpp
   src/utilities/stacktrace.cpp
   src/utilities/stream_pool.cpp
@@ -910,11 +914,10 @@ if(CUDF_LARGE_STRINGS_DISABLED)
   target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED)
 endif()
 
-# Define RMM logging level
-target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL")
-
-# Define spdlog level
-target_compile_definitions(cudf PUBLIC "SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}")
+# Define logging level
+target_compile_definitions(
+  cudf PRIVATE "CUDF_LOG_ACTIVE_LEVEL=CUDF_LOG_LEVEL_${LIBCUDF_LOGGING_LEVEL}"
+)
 
 # Enable remote IO through KvikIO
 target_compile_definitions(cudf PRIVATE $<$<BOOL:${CUDF_KVIKIO_REMOTE_IO}>:CUDF_KVIKIO_REMOTE_IO>)
@@ -928,14 +931,17 @@ if(TARGET CUDA::cuFile${_cufile_suffix})
   target_compile_definitions(cudf PRIVATE CUDF_CUFILE_FOUND)
 endif()
 
+# Remove this after upgrading to a CCCL that has a proper CMake option. See
+# https://github.com/NVIDIA/cccl/pull/2844
+target_compile_definitions(cudf PRIVATE THRUST_FORCE_32_BIT_OFFSET_TYPE=1)
+
 # Compile stringified JIT sources first
 add_dependencies(cudf jitify_preprocess_run)
 
 # Specify the target module library dependencies
 target_link_libraries(
   cudf
-  PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $<BUILD_LOCAL_INTERFACE:BS::thread_pool>
-         spdlog::spdlog_header_only
+  PUBLIC CCCL::CCCL rmm::rmm rmm::rmm_logger $<BUILD_LOCAL_INTERFACE:BS::thread_pool> cudf_logger
   PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp>
           cuco::cuco
           ZLIB::ZLIB
@@ -944,6 +950,7 @@ target_link_libraries(
           $<TARGET_NAME_IF_EXISTS:CUDA::cuFile${_cufile_suffix}>
           nanoarrow
           rmm::rmm_logger_impl
+          cudf_logger_impl
 )
 
 # Add Conda library, and include paths if specified
@@ -1099,7 +1106,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL)
       ${_tgt} PRIVATE "$<BUILD_INTERFACE:$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>>"
     )
     target_include_directories(${_tgt} PRIVATE "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/include>")
-    target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm)
+    target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm rmm::rmm_logger rmm::rmm_logger_impl)
     if(CUDF_BUILD_STACKTRACE_DEBUG)
       target_link_libraries(${_tgt} PRIVATE cudf_backtrace)
     endif()
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 8e5ea900efa..0ff712c1c77 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -140,8 +140,9 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
 endfunction()
 
 # ##################################################################################################
-# * column benchmarks -----------------------------------------------------------------------------
-ConfigureBench(COLUMN_CONCAT_BENCH column/concatenate.cpp)
+# * copying benchmarks
+# -----------------------------------------------------------------------------
+ConfigureNVBench(COPYING_NVBENCH copying/concatenate.cpp)
 
 # ##################################################################################################
 # * gather benchmark ------------------------------------------------------------------------------
@@ -351,11 +352,18 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
 
 # ##################################################################################################
 # * nvtext benchmark -------------------------------------------------------------------
-ConfigureBench(TEXT_BENCH text/subword.cpp)
-
 ConfigureNVBench(
-  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
-  text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
+  TEXT_NVBENCH
+  text/edit_distance.cpp
+  text/hash_ngrams.cpp
+  text/jaccard.cpp
+  text/minhash.cpp
+  text/ngrams.cpp
+  text/normalize.cpp
+  text/replace.cpp
+  text/subword.cpp
+  text/tokenize.cpp
+  text/vocab.cpp
 )
 
 # ##################################################################################################
@@ -417,6 +425,11 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
 # ---------------------------------------------------------------------------------
 ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
 
+# ##################################################################################################
+# * rolling benchmark
+# ---------------------------------------------------------------------------------
+ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp)
+
 add_custom_target(
   run_benchmarks
   DEPENDS CUDF_BENCHMARKS
diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp
deleted file mode 100644
index 51106c72137..00000000000
--- a/cpp/benchmarks/column/concatenate.cpp
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/fixture/templated_benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
-
-#include <cudf_test/column_wrapper.hpp>
-
-#include <cudf/concatenate.hpp>
-#include <cudf/table/table.hpp>
-#include <cudf/utilities/default_stream.hpp>
-
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-
-#include <algorithm>
-#include <vector>
-
-class Concatenate : public cudf::benchmark {};
-
-template <typename T, bool Nullable>
-static void BM_concatenate(benchmark::State& state)
-{
-  cudf::size_type const num_rows = state.range(0);
-  cudf::size_type const num_cols = state.range(1);
-
-  auto input         = create_sequence_table(cycle_dtypes({cudf::type_to_id<T>()}, num_cols),
-                                     row_count{num_rows},
-                                     Nullable ? std::optional<double>{2.0 / 3.0} : std::nullopt);
-  auto input_columns = input->view();
-  std::vector<cudf::column_view> column_views(input_columns.begin(), input_columns.end());
-
-  CUDF_CHECK_CUDA(0);
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    auto result = cudf::concatenate(column_views);
-  }
-
-  state.SetBytesProcessed(state.iterations() * num_cols * num_rows * sizeof(T));
-}
-
-#define CONCAT_BENCHMARK_DEFINE(type, nullable)                             \
-  BENCHMARK_DEFINE_F(Concatenate, BM_concatenate##_##nullable_##nullable)   \
-  (::benchmark::State & st) { BM_concatenate<type, nullable>(st); }         \
-  BENCHMARK_REGISTER_F(Concatenate, BM_concatenate##_##nullable_##nullable) \
-    ->RangeMultiplier(8)                                                    \
-    ->Ranges({{1 << 6, 1 << 18}, {2, 1024}})                                \
-    ->Unit(benchmark::kMillisecond)                                         \
-    ->UseManualTime();
-
-CONCAT_BENCHMARK_DEFINE(int64_t, false)
-CONCAT_BENCHMARK_DEFINE(int64_t, true)
-
-template <typename T, bool Nullable>
-static void BM_concatenate_tables(benchmark::State& state)
-{
-  cudf::size_type const num_rows   = state.range(0);
-  cudf::size_type const num_cols   = state.range(1);
-  cudf::size_type const num_tables = state.range(2);
-
-  std::vector<std::unique_ptr<cudf::table>> tables(num_tables);
-  std::generate_n(tables.begin(), num_tables, [&]() {
-    return create_sequence_table(cycle_dtypes({cudf::type_to_id<T>()}, num_cols),
-                                 row_count{num_rows},
-                                 Nullable ? std::optional<double>{2.0 / 3.0} : std::nullopt);
-  });
-
-  // Generate table views
-  std::vector<cudf::table_view> table_views(num_tables);
-  std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) mutable {
-    return table->view();
-  });
-
-  CUDF_CHECK_CUDA(0);
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    auto result = cudf::concatenate(table_views);
-  }
-
-  state.SetBytesProcessed(state.iterations() * num_cols * num_rows * num_tables * sizeof(T));
-}
-
-#define CONCAT_TABLES_BENCHMARK_DEFINE(type, nullable)                             \
-  BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable)   \
-  (::benchmark::State & st) { BM_concatenate_tables<type, nullable>(st); }         \
-  BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_tables##_##nullable_##nullable) \
-    ->RangeMultiplier(8)                                                           \
-    ->Ranges({{1 << 8, 1 << 12}, {2, 32}, {2, 128}})                               \
-    ->Unit(benchmark::kMillisecond)                                                \
-    ->UseManualTime();
-
-CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, false)
-CONCAT_TABLES_BENCHMARK_DEFINE(int64_t, true)
-
-class ConcatenateStrings : public cudf::benchmark {};
-
-template <bool Nullable>
-static void BM_concatenate_strings(benchmark::State& state)
-{
-  using column_wrapper = cudf::test::strings_column_wrapper;
-
-  auto const num_rows  = state.range(0);
-  auto const num_chars = state.range(1);
-  auto const num_cols  = state.range(2);
-
-  std::string str(num_chars, 'a');
-
-  // Create owning columns
-  std::vector<column_wrapper> columns;
-  columns.reserve(num_cols);
-  std::generate_n(std::back_inserter(columns), num_cols, [num_rows, c_str = str.c_str()]() {
-    auto iter = thrust::make_constant_iterator(c_str);
-    if (Nullable) {
-      auto count_it = thrust::make_counting_iterator(0);
-      auto valid_iter =
-        thrust::make_transform_iterator(count_it, [](auto i) { return i % 3 == 0; });
-      return column_wrapper(iter, iter + num_rows, valid_iter);
-    } else {
-      return column_wrapper(iter, iter + num_rows);
-    }
-  });
-
-  // Generate column views
-  std::vector<cudf::column_view> column_views;
-  column_views.reserve(columns.size());
-  std::transform(
-    columns.begin(), columns.end(), std::back_inserter(column_views), [](auto const& col) {
-      return static_cast<cudf::column_view>(col);
-    });
-
-  CUDF_CHECK_CUDA(0);
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    auto result = cudf::concatenate(column_views);
-  }
-
-  state.SetBytesProcessed(state.iterations() * num_cols * num_rows *
-                          (sizeof(int32_t) + num_chars));  // offset + chars
-}
-
-#define CONCAT_STRINGS_BENCHMARK_DEFINE(nullable)                                   \
-  BENCHMARK_DEFINE_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable)   \
-  (::benchmark::State & st) { BM_concatenate_strings<nullable>(st); }               \
-  BENCHMARK_REGISTER_F(Concatenate, BM_concatenate_strings##_##nullable_##nullable) \
-    ->RangeMultiplier(8)                                                            \
-    ->Ranges({{1 << 8, 1 << 14}, {8, 128}, {2, 256}})                               \
-    ->Unit(benchmark::kMillisecond)                                                 \
-    ->UseManualTime();
-
-CONCAT_STRINGS_BENCHMARK_DEFINE(false)
-CONCAT_STRINGS_BENCHMARK_DEFINE(true)
diff --git a/cpp/benchmarks/copying/concatenate.cpp b/cpp/benchmarks/copying/concatenate.cpp
new file mode 100644
index 00000000000..586b479d0ad
--- /dev/null
+++ b/cpp/benchmarks/copying/concatenate.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/concatenate.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <vector>
+
+static void bench_concatenate(nvbench::state& state)
+{
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_cols = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const nulls    = state.get_float64("nulls");  // fractional axis value; must stay a double
+  // Input: `num_cols` int64 columns of `num_rows` rows each, exposed as column views.
+  auto input = create_sequence_table(
+    cycle_dtypes({cudf::type_to_id<int64_t>()}, num_cols), row_count{num_rows}, nulls);
+  auto input_columns = input->view();
+  auto column_views  = std::vector<cudf::column_view>(input_columns.begin(), input_columns.end());
+  // Run on cudf's default stream; report one int64 read and one int64 write per input element.
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.add_global_memory_reads<int64_t>(num_rows * num_cols);
+  state.add_global_memory_writes<int64_t>(num_rows * num_cols);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); });
+}
+
+NVBENCH_BENCH(bench_concatenate)
+  .set_name("concatenate")
+  .add_int64_axis("num_rows", {64, 512, 4096, 32768, 262144})
+  .add_int64_axis("num_cols", {2, 8, 64, 512, 1024})
+  .add_float64_axis("nulls", {0.0, 0.3});  // forwarded unchanged to create_sequence_table
+
+static void bench_concatenate_strings(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_cols  = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const nulls     = state.get_float64("nulls");  // fractional axis value; must stay a double
+  // One random STRING column: NORMAL length distribution with bounds 0 and row_width.
+  data_profile const profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .null_probability(nulls);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
+  auto const input  = column->view();
+  // Every concatenate input is the same view, repeated num_cols times.
+  auto column_views = std::vector<cudf::column_view>(num_cols, input);
+  // Bytes moved are counted from the chars size, so reads and writes both use int8_t elements.
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto const sv = cudf::strings_column_view(input);
+  state.add_global_memory_reads<int8_t>(sv.chars_size(stream) * num_cols);
+  state.add_global_memory_writes<int8_t>(sv.chars_size(stream) * num_cols);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { auto result = cudf::concatenate(column_views); });
+}
+
+NVBENCH_BENCH(bench_concatenate_strings)
+  .set_name("concatenate_strings")
+  .add_int64_axis("num_rows", {256, 512, 4096, 16384})
+  .add_int64_axis("num_cols", {2, 8, 64, 256})
+  .add_int64_axis("row_width", {32, 128})  // upper bound of the string length distribution
+  .add_float64_axis("nulls", {0.0, 0.3});  // forwarded unchanged to null_probability()
diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp
index 45b46005c47..38a21961735 100644
--- a/cpp/benchmarks/io/cuio_common.cpp
+++ b/cpp/benchmarks/io/cuio_common.cpp
@@ -17,7 +17,7 @@
 #include <benchmarks/io/cuio_common.hpp>
 
 #include <cudf/detail/utilities/integer_utils.hpp>
-#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/mr/pinned_host_memory_resource.hpp>
diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu
index 3502cbcea2a..1085b03ac7b 100644
--- a/cpp/benchmarks/join/distinct_join.cu
+++ b/cpp/benchmarks/join/distinct_join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,13 +23,8 @@ void distinct_inner_join(nvbench::state& state,
   auto join = [](cudf::table_view const& probe_input,
                  cudf::table_view const& build_input,
                  cudf::null_equality compare_nulls) {
-    auto const has_nulls =
-      cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input)
-        ? cudf::nullable_join::YES
-        : cudf::nullable_join::NO;
-    auto hj_obj = cudf::distinct_hash_join<cudf::has_nested::NO>{
-      build_input, probe_input, has_nulls, compare_nulls};
-    return hj_obj.inner_join();
+    auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls};
+    return hj_obj.inner_join(probe_input);
   };
 
   BM_join<Key, Nullable>(state, join);
@@ -42,13 +37,8 @@ void distinct_left_join(nvbench::state& state,
   auto join = [](cudf::table_view const& probe_input,
                  cudf::table_view const& build_input,
                  cudf::null_equality compare_nulls) {
-    auto const has_nulls =
-      cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input)
-        ? cudf::nullable_join::YES
-        : cudf::nullable_join::NO;
-    auto hj_obj = cudf::distinct_hash_join<cudf::has_nested::NO>{
-      build_input, probe_input, has_nulls, compare_nulls};
-    return hj_obj.left_join();
+    auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls};
+    return hj_obj.left_join(probe_input);
   };
 
   BM_join<Key, Nullable>(state, join);
diff --git a/cpp/benchmarks/rolling/grouped_rolling_sum.cpp b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp
new file mode 100644
index 00000000000..04afe5ac661
--- /dev/null
+++ b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void bench_row_grouped_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const cardinality    = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+  auto const min_periods    = static_cast<cudf::size_type>(state.get_int64("min_periods"));
+  // Group keys: a random int32 column (UNIFORM, bounds 0 and num_rows) passed through cudf::sort.
+  auto const keys = [&] {
+    data_profile const profile =
+      data_profile_builder()
+        .cardinality(cardinality)
+        .no_validity()
+        .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
+    auto keys = create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
+    return cudf::sort(cudf::table_view{{keys->view()}});
+  }();
+  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+  // Aggregation under test: a rolling SUM over the random `vals` column of `Type` created above.
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+  // The logger is created before the timed region; its peak usage is reported after exec().
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result = cudf::grouped_rolling_window(
+      keys->view(), vals->view(), preceding_size, following_size, min_periods, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_grouped_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 28})  // exponents, per the axis name
+  .add_int64_axis("preceding_size", {1, 10})
+  .add_int64_axis("following_size", {2})
+  .add_int64_axis("min_periods", {1})
+  .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000});  // key profile cardinality
diff --git a/cpp/benchmarks/rolling/rolling_sum.cpp b/cpp/benchmarks/rolling/rolling_sum.cpp
new file mode 100644
index 00000000000..af9ecd6a26f
--- /dev/null
+++ b/cpp/benchmarks/rolling/rolling_sum.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/device_buffer.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+#include <nvbench/nvbench.cuh>
+
+#include <algorithm>
+
+template <typename Type>
+void bench_row_fixed_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+  auto const min_periods    = static_cast<cudf::size_type>(state.get_int64("min_periods"));
+  // Values: a random column of `Type` (UNIFORM, bounds 0 and 100) built with no_validity().
+  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+  // The logger is created before the timed region; its peak usage is reported after exec().
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::rolling_window(vals->view(), preceding_size, following_size, min_periods, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+template <typename Type>
+void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+  // Values: a random column of `Type` (UNIFORM, bounds 0 and 100) built with no_validity().
+  auto vals = [&]() {
+    data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+      cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+    return create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+  }();
+  // Per-row preceding window sizes, computed on the host: preceding_size capped at i + 1.
+  auto preceding = [&]() {
+    auto data = std::vector<cudf::size_type>(num_rows);
+    auto it   = thrust::make_counting_iterator<cudf::size_type>(0);
+    std::transform(it, it + num_rows, data.begin(), [num_rows, preceding_size](auto i) {
+      return std::min(i + 1, std::max(preceding_size, i + 1 - num_rows));
+    });
+    auto buf = rmm::device_buffer(
+      data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
+    cudf::get_default_stream().synchronize();  // finish the copy before `data` is destroyed
+    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
+                                          num_rows,
+                                          std::move(buf),
+                                          rmm::device_buffer{},  // presumably an empty null mask
+                                          0);
+  }();
+  // Per-row following window sizes: following_size capped at the rows left after i.
+  auto following = [&]() {
+    auto data = std::vector<cudf::size_type>(num_rows);
+    auto it   = thrust::make_counting_iterator<cudf::size_type>(0);
+    std::transform(it, it + num_rows, data.begin(), [num_rows, following_size](auto i) {
+      return std::max(-i - 1, std::min(following_size, num_rows - i - 1));
+    });
+    auto buf = rmm::device_buffer(
+      data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
+    cudf::get_default_stream().synchronize();  // finish the copy before `data` is destroyed
+    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
+                                          num_rows,
+                                          std::move(buf),
+                                          rmm::device_buffer{},  // presumably an empty null mask
+                                          0);
+  }();
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+  // NOTE(review): the literal 1 passed to rolling_window below is presumably min_periods.
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::rolling_window(vals->view(), preceding->view(), following->view(), 1, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_fixed_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})  // exponents, per the axis name
+  .add_int64_axis("preceding_size", {1, 10, 100})
+  .add_int64_axis("following_size", {2})
+  .add_int64_axis("min_periods", {1, 20});
+
+NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_variable_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})  // exponents, per the axis name
+  .add_int64_axis("preceding_size", {10, 100})  // no min_periods axis: the benchmark passes 1
+  .add_int64_axis("following_size", {2});
diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp
index fa017ca9e29..267aa3a93f3 100644
--- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp
+++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp
@@ -63,8 +63,8 @@ void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list<Data
   data_profile profile  = data_profile_builder().cardinality(0).no_validity().distribution(
     input_type, distribution_id::UNIFORM, 0, 20);
 
-  auto source_table =
-    create_random_table(cycle_dtypes({input_type}, n_cols), row_count{n_rows}, profile);
+  auto source_table = create_random_table(
+    cycle_dtypes({input_type, cudf::type_id::STRING}, n_cols), row_count{n_rows}, profile);
 
   profile.set_bool_probability_true(percent_true / 100.0);
   profile.set_null_probability(std::nullopt);  // no null mask
@@ -85,6 +85,6 @@ using data_type = nvbench::type_list<int32_t, int64_t, double, cudf::string_view
 NVBENCH_BENCH_TYPES(apply_boolean_mask_benchmark, NVBENCH_TYPE_AXES(data_type))
   .set_name("apply_boolean_mask")
   .set_type_axes_names({"type"})
-  .add_int64_axis("columns", {1, 4})
+  .add_int64_axis("columns", {1, 4, 9})
   .add_int64_axis("rows", {100'000, 1'000'000, 10'000'000})
-  .add_int64_axis("hits_%", {10, 50, 100});
+  .add_int64_axis("hits_%", {10, 20, 50, 80, 90, 100});
diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp
index d7deebca89a..75d04bb4e8e 100644
--- a/cpp/benchmarks/stream_compaction/distinct.cpp
+++ b/cpp/benchmarks/stream_compaction/distinct.cpp
@@ -34,6 +34,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
   cudf::size_type const num_rows    = state.get_int64("NumRows");
   auto const keep                   = get_keep(state.get_string("keep"));
   cudf::size_type const cardinality = state.get_int64("cardinality");
+  auto const null_probability       = state.get_float64("null_probability");
 
   if (cardinality > num_rows) {
     state.skip("cardinality > num_rows");
@@ -42,7 +43,7 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
 
   data_profile profile = data_profile_builder()
                            .cardinality(cardinality)
-                           .null_probability(0.01)
+                           .null_probability(null_probability)
                            .distribution(cudf::type_to_id<Type>(),
                                          distribution_id::UNIFORM,
                                          static_cast<Type>(0),
@@ -65,6 +66,7 @@ using data_type = nvbench::type_list<int32_t, int64_t>;
 NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
   .set_name("distinct")
   .set_type_axes_names({"Type"})
+  .add_float64_axis("null_probability", {0.01})
   .add_string_axis("keep", {"any", "first", "last", "none"})
   .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000})
   .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000});
diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp
index cd4d3ca964b..9750475a079 100644
--- a/cpp/benchmarks/string/case.cpp
+++ b/cpp/benchmarks/string/case.cpp
@@ -24,18 +24,14 @@
 
 void bench_case(nvbench::state& state)
 {
-  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const max_width = static_cast<int32_t>(state.get_int64("row_width"));
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const encoding  = state.get_string("encoding");
 
-  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(max_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
 
   auto col_view = column->view();
 
@@ -74,6 +70,7 @@ void bench_case(nvbench::state& state)
 
 NVBENCH_BENCH(bench_case)
   .set_name("case")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
-  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("encoding", {"ascii", "utf8"});
diff --git a/cpp/benchmarks/string/char_types.cpp b/cpp/benchmarks/string/char_types.cpp
index eec9a5f54d7..abc5254392e 100644
--- a/cpp/benchmarks/string/char_types.cpp
+++ b/cpp/benchmarks/string/char_types.cpp
@@ -25,16 +25,12 @@
 static void bench_char_types(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const api_type  = state.get_string("api");
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const table_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
   cudf::strings_column_view input(table->view().column(0));
@@ -61,6 +57,7 @@ static void bench_char_types(nvbench::state& state)
 
 NVBENCH_BENCH(bench_char_types)
   .set_name("char_types")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("api", {"all", "filter"});
diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp
index a73017dda18..e3940cbc0c7 100644
--- a/cpp/benchmarks/string/contains.cpp
+++ b/cpp/benchmarks/string/contains.cpp
@@ -29,17 +29,12 @@ std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"
 
 static void bench_contains(nvbench::state& state)
 {
-  auto const n_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const num_rows      = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const row_width     = static_cast<cudf::size_type>(state.get_int64("row_width"));
   auto const pattern_index = static_cast<cudf::size_type>(state.get_int64("pattern"));
   auto const hit_rate      = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
 
-  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
-  auto col   = create_string_column(n_rows, row_width, hit_rate);
+  auto col   = create_string_column(num_rows, row_width, hit_rate);
   auto input = cudf::strings_column_view(col->view());
 
   auto pattern = patterns[pattern_index];
@@ -56,7 +51,7 @@ static void bench_contains(nvbench::state& state)
 
 NVBENCH_BENCH(bench_contains)
   .set_name("contains")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512})
-  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_int64_axis("hit_rate", {50, 100})  // percentage
   .add_int64_axis("pattern", {0, 1, 2});
diff --git a/cpp/benchmarks/string/copy_if_else.cpp b/cpp/benchmarks/string/copy_if_else.cpp
index e06cca497c2..5a5743dfddf 100644
--- a/cpp/benchmarks/string/copy_if_else.cpp
+++ b/cpp/benchmarks/string/copy_if_else.cpp
@@ -25,15 +25,11 @@
 static void bench_copy(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
-
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
 
   data_profile const str_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const source_table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, str_profile);
   auto const target_table =
@@ -58,5 +54,6 @@ static void bench_copy(nvbench::state& state)
 
 NVBENCH_BENCH(bench_copy)
   .set_name("copy_if_else")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/string/copy_range.cpp b/cpp/benchmarks/string/copy_range.cpp
index af217a49195..7e7353a0e78 100644
--- a/cpp/benchmarks/string/copy_range.cpp
+++ b/cpp/benchmarks/string/copy_range.cpp
@@ -25,16 +25,12 @@
 static void bench_copy_range(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
-
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
 
   data_profile const table_profile =
     data_profile_builder()
-      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width)
       .no_validity();
   auto const source_tables = create_random_table(
     {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile);
@@ -56,5 +52,6 @@ static void bench_copy_range(nvbench::state& state)
 
 NVBENCH_BENCH(bench_copy_range)
   .set_name("copy_range")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp
index f964bc5d224..cf90e316f71 100644
--- a/cpp/benchmarks/string/count.cpp
+++ b/cpp/benchmarks/string/count.cpp
@@ -30,16 +30,12 @@ static std::string patterns[] = {"\\d+", "a"};
 static void bench_count(nvbench::state& state)
 {
   auto const num_rows      = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width     = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width     = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width     = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const pattern_index = static_cast<cudf::size_type>(state.get_int64("pattern"));
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const table_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
   cudf::strings_column_view input(table->view().column(0));
@@ -61,6 +57,7 @@ static void bench_count(nvbench::state& state)
 
 NVBENCH_BENCH(bench_count)
   .set_name("count")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_int64_axis("pattern", {0, 1});
diff --git a/cpp/benchmarks/string/extract.cpp b/cpp/benchmarks/string/extract.cpp
index af4fedb5799..d6866598ff4 100644
--- a/cpp/benchmarks/string/extract.cpp
+++ b/cpp/benchmarks/string/extract.cpp
@@ -32,11 +32,6 @@ static void bench_extract(nvbench::state& state)
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   auto groups = static_cast<cudf::size_type>(state.get_int64("groups"));
 
   std::default_random_engine generator;
@@ -79,6 +74,6 @@ static void bench_extract(nvbench::state& state)
 
 NVBENCH_BENCH(bench_extract)
   .set_name("extract")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_int64_axis("groups", {1, 2, 4});
diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp
index 6dcf731ad3c..27652193b7b 100644
--- a/cpp/benchmarks/string/join_strings.cpp
+++ b/cpp/benchmarks/string/join_strings.cpp
@@ -25,15 +25,11 @@
 static void bench_join(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
-
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
 
   data_profile const table_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
   cudf::strings_column_view input(table->view().column(0));
@@ -54,5 +50,6 @@ static void bench_join(nvbench::state& state)
 
 NVBENCH_BENCH(bench_join)
   .set_name("strings_join")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/string/lengths.cpp b/cpp/benchmarks/string/lengths.cpp
index a19060ead3b..8156e19412b 100644
--- a/cpp/benchmarks/string/lengths.cpp
+++ b/cpp/benchmarks/string/lengths.cpp
@@ -25,15 +25,11 @@
 static void bench_lengths(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
-
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
 
   data_profile const table_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
   cudf::strings_column_view input(table->view().column(0));
@@ -51,5 +47,6 @@ static void bench_lengths(nvbench::state& state)
 
 NVBENCH_BENCH(bench_lengths)
   .set_name("lengths")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp
index 105ae65cbe8..f6410aaef30 100644
--- a/cpp/benchmarks/string/like.cpp
+++ b/cpp/benchmarks/string/like.cpp
@@ -30,11 +30,6 @@ static void bench_like(nvbench::state& state)
   auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
   auto const hit_rate  = static_cast<int32_t>(state.get_int64("hit_rate"));
 
-  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   auto col   = create_string_column(n_rows, row_width, hit_rate);
   auto input = cudf::strings_column_view(col->view());
 
@@ -54,6 +49,6 @@ static void bench_like(nvbench::state& state)
 
 NVBENCH_BENCH(bench_like)
   .set_name("strings_like")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512})
-  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_int64_axis("hit_rate", {10, 25, 70, 100});
diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp
index 4dcf1314f83..69426a2d484 100644
--- a/cpp/benchmarks/string/replace_re.cpp
+++ b/cpp/benchmarks/string/replace_re.cpp
@@ -26,18 +26,14 @@
 
 static void bench_replace(nvbench::state& state)
 {
-  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const rtype     = state.get_string("type");
 
-  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
 
   auto program = cudf::strings::regex_program::create("(\\d+)");
@@ -62,6 +58,7 @@ static void bench_replace(nvbench::state& state)
 
 NVBENCH_BENCH(bench_replace)
   .set_name("replace_re")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512})
-  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"replace", "backref"});
diff --git a/cpp/benchmarks/string/reverse.cpp b/cpp/benchmarks/string/reverse.cpp
index a2676609a40..e2e914cb350 100644
--- a/cpp/benchmarks/string/reverse.cpp
+++ b/cpp/benchmarks/string/reverse.cpp
@@ -25,15 +25,11 @@
 static void bench_reverse(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
-
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
 
   data_profile const table_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
   cudf::strings_column_view input(table->view().column(0));
@@ -51,5 +47,6 @@ static void bench_reverse(nvbench::state& state)
 
 NVBENCH_BENCH(bench_reverse)
   .set_name("reverse")
-  .add_int64_axis("row_width", {8, 16, 32, 64, 128})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp
index 1898f0340b6..c828a8ed0b0 100644
--- a/cpp/benchmarks/string/slice.cpp
+++ b/cpp/benchmarks/string/slice.cpp
@@ -36,11 +36,6 @@ static void bench_slice(nvbench::state& state)
   auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
   auto const stype     = state.get_string("type");
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const profile = data_profile_builder().distribution(
     cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
   auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
@@ -76,6 +71,6 @@ static void bench_slice(nvbench::state& state)
 
 NVBENCH_BENCH(bench_slice)
   .set_name("slice")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
-  .add_int64_axis("num_rows", {262144, 2097152, 16777216})
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"position", "multi"});
diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp
index 9ef58daf0fc..9c7c27c4f07 100644
--- a/cpp/benchmarks/string/split.cpp
+++ b/cpp/benchmarks/string/split.cpp
@@ -28,16 +28,12 @@
 static void bench_split(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const stype     = state.get_string("type");
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
   cudf::string_scalar target("+");
@@ -66,6 +62,7 @@ static void bench_split(nvbench::state& state)
 
 NVBENCH_BENCH(bench_split)
   .set_name("split")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"split", "split_ws", "record", "record_ws"});
diff --git a/cpp/benchmarks/string/split_re.cpp b/cpp/benchmarks/string/split_re.cpp
index 1fdb6e67109..34a7aa96e84 100644
--- a/cpp/benchmarks/string/split_re.cpp
+++ b/cpp/benchmarks/string/split_re.cpp
@@ -28,17 +28,13 @@
 static void bench_split(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
-
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
 
   auto prog = cudf::strings::regex_program::create("\\d+");
 
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
 
@@ -56,5 +52,6 @@ static void bench_split(nvbench::state& state)
 
 NVBENCH_BENCH(bench_split)
   .set_name("split_re")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/string/string_bench_args.hpp b/cpp/benchmarks/string/string_bench_args.hpp
deleted file mode 100644
index a34026281e8..00000000000
--- a/cpp/benchmarks/string/string_bench_args.hpp
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cudf/types.hpp>
-
-#include <benchmark/benchmark.h>
-
-#include <limits>
-
-/**
- * @brief Generate row count and row length argument ranges for a string benchmark.
- *
- * Generates a series of row count and row length arguments for string benchmarks.
- * Combinations of row count and row length that would exceed the maximum string character
- * column data length are not generated.
- *
- * @param b           Benchmark to update with row count and row length arguments.
- * @param min_rows    Minimum row count argument to generate.
- * @param max_rows    Maximum row count argument to generate.
- * @param rows_mult   Row count multiplier to generate intermediate row count arguments.
- * @param min_rowlen  Minimum row length argument to generate.
- * @param max_rowlen  Maximum row length argument to generate.
- * @param rowlen_mult Row length multiplier to generate intermediate row length arguments.
- */
-inline void generate_string_bench_args(benchmark::internal::Benchmark* b,
-                                       int min_rows,
-                                       int max_rows,
-                                       int rows_mult,
-                                       int min_rowlen,
-                                       int max_rowlen,
-                                       int rowlen_mult)
-{
-  for (int row_count = min_rows; row_count <= max_rows; row_count *= rows_mult) {
-    for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= rowlen_mult) {
-      // avoid generating combinations that exceed the cudf column limit
-      size_t total_chars = static_cast<size_t>(row_count) * rowlen;
-      if (total_chars < static_cast<size_t>(std::numeric_limits<cudf::size_type>::max())) {
-        b->Args({row_count, rowlen});
-      }
-    }
-  }
-}
diff --git a/cpp/benchmarks/text/edit_distance.cpp b/cpp/benchmarks/text/edit_distance.cpp
index 6ffa90edb8f..0ad1ae30f8c 100644
--- a/cpp/benchmarks/text/edit_distance.cpp
+++ b/cpp/benchmarks/text/edit_distance.cpp
@@ -27,15 +27,11 @@
 static void bench_edit_distance(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
-
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
 
   data_profile const strings_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const strings_table = create_random_table(
     {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
   cudf::strings_column_view input1(strings_table->view().column(0));
@@ -55,5 +51,6 @@ static void bench_edit_distance(nvbench::state& state)
 
 NVBENCH_BENCH(bench_edit_distance)
   .set_name("edit_distance")
-  .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144})
-  .add_int64_axis("row_width", {8, 16, 32, 64, 128, 256});
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144});
diff --git a/cpp/benchmarks/text/hash_ngrams.cpp b/cpp/benchmarks/text/hash_ngrams.cpp
index 4e5daf83a3c..7577cf00c0f 100644
--- a/cpp/benchmarks/text/hash_ngrams.cpp
+++ b/cpp/benchmarks/text/hash_ngrams.cpp
@@ -27,16 +27,12 @@
 static void bench_hash_ngrams(nvbench::state& state)
 {
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const ngrams    = static_cast<cudf::size_type>(state.get_int64("ngrams"));
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const strings_profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const strings_table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
   cudf::strings_column_view input(strings_table->view().column(0));
@@ -55,6 +51,7 @@ static void bench_hash_ngrams(nvbench::state& state)
 
 NVBENCH_BENCH(bench_hash_ngrams)
   .set_name("hash_ngrams")
-  .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144})
-  .add_int64_axis("row_width", {128, 512, 2048})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {128, 512, 2048})
+  .add_int64_axis("num_rows", {16384, 32768, 262144})
   .add_int64_axis("ngrams", {5, 10});
diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp
index d5b74da6773..5506501138b 100644
--- a/cpp/benchmarks/text/jaccard.cpp
+++ b/cpp/benchmarks/text/jaccard.cpp
@@ -28,17 +28,13 @@
 static void bench_jaccard(nvbench::state& state)
 {
   auto const num_rows        = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width       = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width       = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width       = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const substring_width = static_cast<cudf::size_type>(state.get_int64("substring_width"));
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const strings_profile =
     data_profile_builder()
-      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width)
       .no_validity();
   auto const input_table = create_random_table(
     {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
@@ -59,6 +55,7 @@ static void bench_jaccard(nvbench::state& state)
 
 NVBENCH_BENCH(bench_jaccard)
   .set_name("jaccard")
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {128, 512, 1024, 2048})
   .add_int64_axis("num_rows", {32768, 131072, 262144})
-  .add_int64_axis("row_width", {128, 512, 1024, 2048})
   .add_int64_axis("substring_width", {5, 10});
diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp
index a80d0dcbdb8..8c86e8d4366 100644
--- a/cpp/benchmarks/text/minhash.cpp
+++ b/cpp/benchmarks/text/minhash.cpp
@@ -54,9 +54,8 @@ static void bench_minhash(nvbench::state& state)
   state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = base64
-                    ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width)
-                    : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width);
+    auto result = base64 ? nvtext::minhash64(input, 0, parameters_a, parameters_b, hash_width)
+                         : nvtext::minhash(input, 0, parameters_a, parameters_b, hash_width);
   });
 }
 
diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp
index 71bccd80d39..594dc0de28a 100644
--- a/cpp/benchmarks/text/normalize.cpp
+++ b/cpp/benchmarks/text/normalize.cpp
@@ -28,16 +28,12 @@
 static void bench_normalize(nvbench::state& state)
 {
   auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width      = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width      = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width      = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const normalize_type = state.get_string("type");
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+    cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
   auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
 
@@ -60,6 +56,7 @@ static void bench_normalize(nvbench::state& state)
 
 NVBENCH_BENCH(bench_normalize)
   .set_name("normalize")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"spaces", "characters", "to_lower"});
diff --git a/cpp/benchmarks/text/replace.cpp b/cpp/benchmarks/text/replace.cpp
index 767ebab3eee..24ca4e5dfd7 100644
--- a/cpp/benchmarks/text/replace.cpp
+++ b/cpp/benchmarks/text/replace.cpp
@@ -31,11 +31,6 @@ static void bench_replace(nvbench::state& state)
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   std::vector<std::string> words{" ",        "one  ",    "two ",       "three ",     "four ",
                                  "five ",    "six  ",    "sevén  ",    "eight ",     "nine ",
                                  "ten   ",   "eleven ",  "twelve ",    "thirteen  ", "fourteen ",
@@ -71,5 +66,5 @@ static void bench_replace(nvbench::state& state)
 
 NVBENCH_BENCH(bench_replace)
   .set_name("replace")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+  .add_int64_axis("row_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp
index dd8df695d3e..0b4e3bdefa5 100644
--- a/cpp/benchmarks/text/subword.cpp
+++ b/cpp/benchmarks/text/subword.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,9 +14,6 @@
  * limitations under the License.
  */
 
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
-
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/file_utilities.hpp>
 
@@ -24,6 +21,8 @@
 
 #include <nvtext/subword_tokenize.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 #include <filesystem>
 #include <fstream>
 #include <iostream>
@@ -54,40 +53,33 @@ static std::string create_hash_vocab_file()
   return hash_file;
 }
 
-static void BM_subword_tokenizer(benchmark::State& state)
+static void bench_subword_tokenizer(nvbench::state& state)
 {
-  auto const nrows = static_cast<cudf::size_type>(state.range(0));
-  std::vector<char const*> h_strings(nrows, "This is a test ");
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+
+  std::vector<char const*> h_strings(num_rows, "This is a test ");
   cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
   static std::string hash_file = create_hash_vocab_file();
   std::vector<uint32_t> offsets{14};
-  uint32_t max_sequence_length = 64;
-  uint32_t stride              = 48;
-  uint32_t do_truncate         = 0;
-  uint32_t do_lower            = 1;
-  //
-  auto vocab = nvtext::load_vocabulary_file(hash_file);
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);
-    auto result = nvtext::subword_tokenize(cudf::strings_column_view{strings},
-                                           *vocab,
-                                           max_sequence_length,
-                                           stride,
-                                           do_lower,
-                                           do_truncate);
-  }
-}
+  uint32_t max_sequence = 64;
+  uint32_t stride       = 48;
+  uint32_t do_truncate  = 0;
+  uint32_t do_lower     = 1;
 
-class Subword : public cudf::benchmark {};
+  auto input = cudf::strings_column_view{strings};
 
-#define SUBWORD_BM_BENCHMARK_DEFINE(name)                                                        \
-  BENCHMARK_DEFINE_F(Subword, name)(::benchmark::State & state) { BM_subword_tokenizer(state); } \
-  BENCHMARK_REGISTER_F(Subword, name)                                                            \
-    ->RangeMultiplier(2)                                                                         \
-    ->Range(1 << 10, 1 << 17)                                                                    \
-    ->UseManualTime()                                                                            \
-    ->Unit(benchmark::kMillisecond);
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  auto chars_size = input.chars_size(cudf::get_default_stream());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int32_t>(num_rows * max_sequence);
 
-SUBWORD_BM_BENCHMARK_DEFINE(BM_subword_tokenizer);
+  auto vocab = nvtext::load_vocabulary_file(hash_file);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result =
+      nvtext::subword_tokenize(input, *vocab, max_sequence, stride, do_lower, do_truncate);
+  });
+}
 
-// BENCHMARK_MAIN();
+NVBENCH_BENCH(bench_subword_tokenizer)
+  .set_name("subword_tokenize")
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp
index e83310e0343..b9590c5539f 100644
--- a/cpp/benchmarks/text/tokenize.cpp
+++ b/cpp/benchmarks/text/tokenize.cpp
@@ -31,17 +31,13 @@
 static void bench_tokenize(nvbench::state& state)
 {
   auto const num_rows      = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width     = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width     = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width     = static_cast<cudf::size_type>(state.get_int64("max_width"));
   auto const tokenize_type = state.get_string("type");
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
   data_profile const profile =
     data_profile_builder()
-      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width)
       .no_validity();
   auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
@@ -82,6 +78,7 @@ static void bench_tokenize(nvbench::state& state)
 
 NVBENCH_BENCH(bench_tokenize)
   .set_name("tokenize")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"whitespace", "multi", "count", "count_multi", "ngrams", "characters"});
diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp
index 523d277df18..0502f375d99 100644
--- a/cpp/benchmarks/text/vocab.cpp
+++ b/cpp/benchmarks/text/vocab.cpp
@@ -33,16 +33,12 @@ static void bench_vocab_tokenize(nvbench::state& state)
 {
   auto const stream    = cudf::get_default_stream();
   auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const min_width = static_cast<cudf::size_type>(state.get_int64("min_width"));
+  auto const max_width = static_cast<cudf::size_type>(state.get_int64("max_width"));
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
-  }
-
-  auto const column = [num_rows, row_width] {
+  auto const column = [num_rows, min_width, max_width] {
     data_profile const profile = data_profile_builder().no_validity().distribution(
-      cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+      cudf::type_id::STRING, distribution_id::NORMAL, min_width, max_width);
     auto const col = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
     return cudf::strings::filter_characters_of_type(
       cudf::strings_column_view(col->view()),
@@ -85,5 +81,6 @@ static void bench_vocab_tokenize(nvbench::state& state)
 
 NVBENCH_BENCH(bench_vocab_tokenize)
   .set_name("vocab_tokenize")
-  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
-  .add_int64_axis("num_rows", {262144, 524288, 1048576, 2097152, 4194304, 16777216});
+  .add_int64_axis("min_width", {0})
+  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152});
diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp
deleted file mode 100644
index adc3dddc59c..00000000000
--- a/cpp/benchmarks/text/word_minhash.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <benchmarks/common/generate_input.hpp>
-
-#include <cudf/column/column_factories.hpp>
-#include <cudf/filling.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/strings/strings_column_view.hpp>
-
-#include <nvtext/minhash.hpp>
-
-#include <rmm/device_buffer.hpp>
-
-#include <nvbench/nvbench.cuh>
-
-static void bench_word_minhash(nvbench::state& state)
-{
-  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
-  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
-  auto const base64     = state.get_int64("hash_type") == 64;
-
-  data_profile const strings_profile =
-    data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
-  auto strings_table =
-    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
-
-  auto const num_offsets = (num_rows / row_width) + 1;
-  auto offsets           = cudf::sequence(num_offsets,
-                                cudf::numeric_scalar<cudf::size_type>(0),
-                                cudf::numeric_scalar<cudf::size_type>(row_width));
-
-  auto source = cudf::make_lists_column(num_offsets - 1,
-                                        std::move(offsets),
-                                        std::move(strings_table->release().front()),
-                                        0,
-                                        rmm::device_buffer{});
-
-  data_profile const seeds_profile = data_profile_builder().no_validity().distribution(
-    cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, 256);
-  auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
-  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
-  auto seeds             = seeds_table->get_column(0);
-
-  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
-
-  cudf::strings_column_view input(cudf::lists_column_view(source->view()).child());
-  auto chars_size = input.chars_size(cudf::get_default_stream());
-  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
-  state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
-
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view())
-                         : nvtext::word_minhash(source->view(), seeds.view());
-  });
-}
-
-NVBENCH_BENCH(bench_word_minhash)
-  .set_name("word_minhash")
-  .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152})
-  .add_int64_axis("row_width", {10, 100, 1000})
-  .add_int64_axis("seed_count", {2, 25})
-  .add_int64_axis("hash_type", {32, 64});
diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
index c440643037b..b0c48e04710 100644
--- a/cpp/cmake/thirdparty/get_nanoarrow.cmake
+++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -14,11 +14,6 @@
 
 # This function finds nanoarrow and sets any additional necessary environment variables.
 function(find_and_configure_nanoarrow)
-  include(${rapids-cmake-dir}/cpm/package_override.cmake)
-
-  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
-  rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json")
-
   if(NOT BUILD_SHARED_LIBS)
     set(_exclude_from_all EXCLUDE_FROM_ALL FALSE)
   else()
@@ -31,6 +26,9 @@ function(find_and_configure_nanoarrow)
     nanoarrow 0.6.0.dev
     GLOBAL_TARGETS nanoarrow
     CPM_ARGS
+    GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
+    GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb
+    GIT_SHALLOW FALSE
     OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf" ${_exclude_from_all}
   )
   set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/cpp/cmake/thirdparty/get_spdlog.cmake b/cpp/cmake/thirdparty/get_spdlog.cmake
deleted file mode 100644
index 90b0f4d8a8e..00000000000
--- a/cpp/cmake/thirdparty/get_spdlog.cmake
+++ /dev/null
@@ -1,27 +0,0 @@
-# =============================================================================
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-# Use CPM to find or clone speedlog
-function(find_and_configure_spdlog)
-
-  include(${rapids-cmake-dir}/cpm/spdlog.cmake)
-  rapids_cpm_spdlog(
-    FMT_OPTION "EXTERNAL_FMT_HO"
-    INSTALL_EXPORT_SET cudf-exports
-    BUILD_EXPORT_SET cudf-exports
-  )
-
-endfunction()
-
-find_and_configure_spdlog()
diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json
index 2f29578f7ae..d5cadce40c2 100644
--- a/cpp/cmake/thirdparty/patches/cccl_override.json
+++ b/cpp/cmake/thirdparty/patches/cccl_override.json
@@ -3,11 +3,6 @@
   "packages" : {
     "CCCL" : {
       "patches" : [
-        {
-          "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff",
-          "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
-          "fixed_in" : ""
-        },
         {
           "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff",
           "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff b/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff
deleted file mode 100644
index e9a36fcb567..00000000000
--- a/cpp/cmake/thirdparty/patches/nanoarrow_clang_tidy_compliance.diff
+++ /dev/null
@@ -1,38 +0,0 @@
-diff --git a/src/nanoarrow/common/inline_buffer.h b/src/nanoarrow/common/inline_buffer.h
-index caa6be4..70ec8a2 100644
---- a/src/nanoarrow/common/inline_buffer.h
-+++ b/src/nanoarrow/common/inline_buffer.h
-@@ -347,7 +347,7 @@ static inline void _ArrowBitsUnpackInt32(const uint8_t word, int32_t* out) {
- }
- 
- static inline void _ArrowBitmapPackInt8(const int8_t* values, uint8_t* out) {
--  *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) |
-+  *out = (uint8_t)(values[0] | ((values[1] + 0x1) & 0x2) | ((values[2] + 0x3) & 0x4) | // NOLINT
-                    ((values[3] + 0x7) & 0x8) | ((values[4] + 0xf) & 0x10) |
-                    ((values[5] + 0x1f) & 0x20) | ((values[6] + 0x3f) & 0x40) |
-                    ((values[7] + 0x7f) & 0x80));
-@@ -471,13 +471,13 @@ static inline void ArrowBitsSetTo(uint8_t* bits, int64_t start_offset, int64_t l
-     // set bits within a single byte
-     const uint8_t only_byte_mask =
-         i_end % 8 == 0 ? first_byte_mask : (uint8_t)(first_byte_mask | last_byte_mask);
--    bits[bytes_begin] &= only_byte_mask;
-+    bits[bytes_begin] &= only_byte_mask;  // NOLINT
-     bits[bytes_begin] |= (uint8_t)(fill_byte & ~only_byte_mask);
-     return;
-   }
- 
-   // set/clear trailing bits of first byte
--  bits[bytes_begin] &= first_byte_mask;
-+  bits[bytes_begin] &= first_byte_mask;  // NOLINT
-   bits[bytes_begin] |= (uint8_t)(fill_byte & ~first_byte_mask);
- 
-   if (bytes_end - bytes_begin > 2) {
-@@ -637,7 +637,7 @@ static inline void ArrowBitmapAppendInt8Unsafe(struct ArrowBitmap* bitmap,
-   n_remaining -= n_full_bytes * 8;
-   if (n_remaining > 0) {
-     // Zero out the last byte
--    *out_cursor = 0x00;
-+    *out_cursor = 0x00;  // NOLINT
-     for (int i = 0; i < n_remaining; i++) {
-       ArrowBitSetTo(bitmap->buffer.data, out_i_cursor++, values_cursor[i]);
-     }
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json
deleted file mode 100644
index d529787e7c8..00000000000
--- a/cpp/cmake/thirdparty/patches/nanoarrow_override.json
+++ /dev/null
@@ -1,18 +0,0 @@
-
-{
-  "packages" : {
-    "nanoarrow" : {
-      "version" : "0.6.0.dev",
-      "git_url" : "https://github.com/apache/arrow-nanoarrow.git",
-      "git_tag" : "1e2664a70ec14907409cadcceb14d79b9670bcdb",
-      "git_shallow" : false,
-      "patches" : [
-        {
-          "file" : "${current_json_dir}/nanoarrow_clang_tidy_compliance.diff",
-          "issue" : "https://github.com/apache/arrow-nanoarrow/issues/537",
-          "fixed_in" : ""
-        }
-      ]
-    }
-  }
-}
diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff
deleted file mode 100644
index 9f68d85e7db..00000000000
--- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff
+++ /dev/null
@@ -1,22 +0,0 @@
-diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
-index 3d004aa55..71ce86bea 100644
---- a/thrust/thrust/system/cuda/detail/dispatch.h
-+++ b/thrust/thrust/system/cuda/detail/dispatch.h
-@@ -63,7 +63,7 @@
-   _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count1)                \
-   _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count2)
- 
--#if defined(THRUST_FORCE_64_BIT_OFFSET_TYPE)
-+#if 0
- //! @brief Always dispatches to 64 bit offset version of an algorithm
- #  define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \
-     _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count)               \
-@@ -89,7 +89,7 @@
-     _THRUST_INDEX_TYPE_DISPATCH_GUARD_UNDERFLOW(count)                                     \
-     _THRUST_INDEX_TYPE_DISPATCH(std::uint64_t, status, call_64, count, arguments)
- 
--#elif defined(THRUST_FORCE_32_BIT_OFFSET_TYPE)
-+#elif 1
- 
- //! @brief Ensures that the size of the input does not overflow the offset type
- #  define _THRUST_INDEX_TYPE_DISPATCH_GUARD_OVERFLOW(index_type, count)                       \
diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index 1c1052487f2..5032a073b58 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -1082,15 +1082,15 @@ initialization. If this setting is higher than the compile-time CMake variable,
 in between the two settings will be excluded from the written log. The available levels are the same
 as for the CMake variable.
 * Global logger object exposed via `cudf::logger()` - sets the minimum logging level at runtime.
-For example, calling `cudf::logger().set_level(spdlog::level::err)`, will exclude any messages that
+For example, calling `cudf::default_logger().set_level(level_enum::err)` will exclude any messages that
 are not errors or critical errors. This API should not be used within libcudf to manipulate logging,
 its purpose is to allow upstream users to configure libcudf logging to fit their application.
 
 By default, logging messages are output to stderr.
 Setting the environment variable `LIBCUDF_DEBUG_LOG_FILE` redirects the log to a file with the
 specified path (can be relative to the current directory).
-Upstream users can also manipulate `cudf::logger().sinks()` to add sinks or divert the log to
-standard output or even a custom spdlog sink.
+Upstream users can also manipulate `cudf::default_logger().sinks()` to add sinks or divert the log to
+standard output.
 
 # Data Types
 
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index f5f514d26d9..a1b7db5e08a 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -110,8 +110,9 @@ class aggregation {
     COLLECT_SET,     ///< collect values into a list without duplicate entries
     LEAD,            ///< window function, accesses row at specified offset following current row
     LAG,             ///< window function, accesses row at specified offset preceding current row
-    PTX,             ///< PTX  UDF based reduction
-    CUDA,            ///< CUDA UDF based reduction
+    PTX,             ///< PTX  based UDF aggregation
+    CUDA,            ///< CUDA based UDF aggregation
+    HOST_UDF,        ///< host based UDF aggregation
     MERGE_LISTS,     ///< merge multiple lists values into one list
     MERGE_SETS,      ///< merge multiple lists values into one list then drop duplicate entries
     MERGE_M2,        ///< merge partial values of M2 aggregation,
@@ -120,7 +121,7 @@ class aggregation {
     TDIGEST,         ///< create a tdigest from a set of input values
     MERGE_TDIGEST,   ///< create a tdigest by merging multiple tdigests together
     HISTOGRAM,       ///< compute frequency of each element
-    MERGE_HISTOGRAM  ///< merge partial values of HISTOGRAM aggregation,
+    MERGE_HISTOGRAM  ///< merge partial values of HISTOGRAM aggregation
   };
 
   aggregation() = delete;
@@ -599,6 +600,18 @@ std::unique_ptr<Base> make_udf_aggregation(udf_type type,
                                            std::string const& user_defined_aggregator,
                                            data_type output_type);
 
+// Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation.
+struct host_udf_base;
+
+/**
+ * @brief Factory to create a HOST_UDF aggregation.
+ *
+ * @param host_udf An instance of a class derived from `host_udf_base` to perform aggregation
+ * @return A HOST_UDF aggregation object
+ */
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_host_udf_aggregation(std::unique_ptr<host_udf_base> host_udf);
+
 /**
  * @brief Factory to create a MERGE_LISTS aggregation.
  *
diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp
new file mode 100644
index 00000000000..bbce76dc5f3
--- /dev/null
+++ b/cpp/include/cudf/aggregation/host_udf.hpp
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/aggregation.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
+#include <cudf/utilities/span.hpp>
+#include <cudf/utilities/traits.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <optional>
+#include <unordered_map>
+#include <unordered_set>
+#include <variant>
+
+/**
+ * @file host_udf.hpp
+ * @brief Declare the base class for host-side user-defined functions (`HOST_UDF`) and an example
+ * of a subclass implementation.
+ */
+
+namespace CUDF_EXPORT cudf {
+/**
+ * @addtogroup aggregation_factories
+ * @{
+ */
+
+/**
+ * @brief The interface for host-based UDF implementation.
+ *
+ * An implementation of host-based UDF needs to be derived from this base class, defining
+ * its own version of the required functions. In particular:
+ *  - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`,
+ *    and `clone` functions.
+ *  - If necessary, the derived class can also override `do_hash` to compute hashing for its
+ *    instance, and `get_required_data` to selectively access the input data as well as
+ *    intermediate data provided by libcudf.
+ *
+ * Example of such implementation:
+ * @code{.cpp}
+ * struct my_udf_aggregation : cudf::host_udf_base {
+ *   my_udf_aggregation() = default;
+ *
+ *   // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`,
+ *   // and the result from groupby `MAX` aggregation.
+ *   [[nodiscard]] data_attribute_set_t get_required_data() const override
+ *   {
+ *       return {groupby_data_attribute::GROUPED_VALUES,
+ *               groupby_data_attribute::GROUP_OFFSETS,
+ *               cudf::make_max_aggregation<cudf::groupby_aggregation>()};
+ *   }
+ *
+ *   [[nodiscard]] output_t get_empty_output(
+ *     [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
+ *     [[maybe_unused]] rmm::cuda_stream_view stream,
+ *     [[maybe_unused]] rmm::device_async_resource_ref mr) const override
+ *   {
+ *     // This UDF aggregation always returns a column of type INT32.
+ *     return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
+ *   }
+ *
+ *   [[nodiscard]] output_t operator()(input_map_t const& input,
+ *                                     rmm::cuda_stream_view stream,
+ *                                     rmm::device_async_resource_ref mr) const override
+ *   {
+ *     // Perform UDF computation using the input data and return the result.
+ *   }
+ *
+ *   [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+ *   {
+ *     // Check if the other object is also instance of this class.
+ *     return dynamic_cast<my_udf_aggregation const*>(&other) != nullptr;
+ *   }
+ *
+ *   [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+ *   {
+ *     return std::make_unique<my_udf_aggregation>();
+ *   }
+ * };
+ * @endcode
+ */
+struct host_udf_base {
+  host_udf_base()          = default;
+  virtual ~host_udf_base() = default;
+
+  /**
+   * @brief Define the possible data needed for groupby aggregations.
+   *
+   * Note that only sort-based groupby aggregations are supported.
+   */
+  enum class groupby_data_attribute : int32_t {
+    INPUT_VALUES,    ///< The input values column.
+    GROUPED_VALUES,  ///< The input values grouped according to the input `keys` for which the
+                     ///< values within each group maintain their original order.
+    SORTED_GROUPED_VALUES,  ///< The input values grouped according to the input `keys` and
+                            ///< sorted within each group.
+    NUM_GROUPS,             ///< The number of groups (i.e., number of distinct keys).
+    GROUP_OFFSETS,          ///< The offsets separating groups.
+    GROUP_LABELS            ///< Group labels (which are the same as group indices).
+  };
+
+  /**
+   * @brief Describe possible data that may be needed in the derived class for its operations.
+   *
+   * Such data can be either intermediate data such as sorted values or group labels etc, or the
+   * results of other aggregations.
+   *
+   * Each derived host-based UDF class may need a different set of data. It is inefficient to
+   * evaluate and pass down all these possible data at once from libcudf. A solution for that is,
+   * the derived class can define a subset of data that it needs and libcudf will evaluate
+   * and pass down only data requested from that set.
+   */
+  struct data_attribute {
+    /**
+     * @brief Hold all possible data types for the input of the aggregation in the derived class.
+     */
+    using value_type = std::variant<groupby_data_attribute, std::unique_ptr<aggregation>>;
+    value_type value;  ///< The actual data attribute, wrapped by this struct
+                       ///< as a wrapper is needed to define `hash` and `equal_to` functors.
+
+    data_attribute()                 = default;  ///< Default constructor
+    data_attribute(data_attribute&&) = default;  ///< Move constructor
+
+    /**
+     * @brief Construct a new data attribute from an aggregation attribute.
+     * @param value_ An aggregation attribute
+     */
+    template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, groupby_data_attribute>)>
+    data_attribute(T value_) : value{value_}
+    {
+    }
+
+    /**
+     * @brief Construct a new data attribute from another aggregation request.
+     * @param value_ An aggregation request
+     */
+    template <typename T,
+              CUDF_ENABLE_IF(std::is_same_v<T, aggregation> ||
+                             std::is_same_v<T, groupby_aggregation>)>
+    data_attribute(std::unique_ptr<T> value_) : value{std::move(value_)}
+    {
+      CUDF_EXPECTS(std::get<std::unique_ptr<aggregation>>(value) != nullptr,
+                   "Invalid aggregation request.");
+      if constexpr (std::is_same_v<T, aggregation>) {
+        CUDF_EXPECTS(
+          dynamic_cast<groupby_aggregation*>(std::get<std::unique_ptr<T>>(value).get()) != nullptr,
+          "Requesting results from other aggregations is only supported in groupby "
+          "aggregations.");
+      }
+    }
+
+    /**
+     * @brief Copy constructor.
+     * @param other The other data attribute to copy from
+     */
+    data_attribute(data_attribute const& other);
+
+    /**
+     * @brief Hash functor for `data_attribute`.
+     */
+    struct hash {
+      /**
+       * @brief Compute the hash value of a data attribute.
+       * @param attr The data attribute to hash
+       * @return The hash value of the data attribute
+       */
+      std::size_t operator()(data_attribute const& attr) const;
+    };  // struct hash
+
+    /**
+     * @brief Equality comparison functor for `data_attribute`.
+     */
+    struct equal_to {
+      /**
+       * @brief Check if two data attributes are equal.
+       * @param lhs The left-hand side data attribute
+       * @param rhs The right-hand side data attribute
+       * @return True if the two data attributes are equal
+       */
+      bool operator()(data_attribute const& lhs, data_attribute const& rhs) const;
+    };  // struct equal_to
+  };    // struct data_attribute
+
+  /**
+   * @brief Set of attributes for the input data that is needed for computing the aggregation.
+   */
+  using data_attribute_set_t =
+    std::unordered_set<data_attribute, data_attribute::hash, data_attribute::equal_to>;
+
+  /**
+   * @brief Return a set of attributes for the data that is needed for computing the aggregation.
+   *
+   * The derived class should return the attributes corresponding to only the data that it needs to
+   * avoid unnecessary computation performed in libcudf. If this function is not overridden, an
+   * empty set is returned. That means all the data attributes (except results from other
+   * aggregations in groupby) will be needed.
+   *
+   * @return A set of `data_attribute`
+   */
+  [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; }
+
+  /**
+   * @brief Hold all possible types of the data that is passed to the derived class for executing
+   * the aggregation.
+   */
+  using input_data_t = std::variant<column_view, size_type, device_span<size_type const>>;
+
+  /**
+   * @brief Input to the aggregation, mapping from each data attribute to its actual data.
+   */
+  using input_map_t = std::
+    unordered_map<data_attribute, input_data_t, data_attribute::hash, data_attribute::equal_to>;
+
+  /**
+   * @brief Output type of the aggregation.
+   *
+   * Currently only a single type is supported as the output of the aggregation, but it will hold
+   * more types in the future when reduction is supported.
+   */
+  using output_t = std::variant<std::unique_ptr<column>>;
+
+  /**
+   * @brief Get the output when the input values column is empty.
+   *
+   * This is called in libcudf when the input values column is empty. In such situations libcudf
+   * tries to generate the output directly without unnecessarily evaluating the intermediate data.
+   *
+   * @param output_dtype The expected output data type
+   * @param stream The CUDA stream to use for any kernel launches
+   * @param mr Device memory resource to use for any allocations
+   * @return The output result of the aggregation when input values is empty
+   */
+  [[nodiscard]] virtual output_t get_empty_output(std::optional<data_type> output_dtype,
+                                                  rmm::cuda_stream_view stream,
+                                                  rmm::device_async_resource_ref mr) const = 0;
+
+  /**
+   * @brief Perform the main computation for the host-based UDF.
+   *
+   * @param input The input data needed for performing all computation
+   * @param stream The CUDA stream to use for any kernel launches
+   * @param mr Device memory resource to use for any allocations
+   * @return The output result of the aggregation
+   */
+  [[nodiscard]] virtual output_t operator()(input_map_t const& input,
+                                            rmm::cuda_stream_view stream,
+                                            rmm::device_async_resource_ref mr) const = 0;
+
+  /**
+   * @brief Computes hash value of the class's instance.
+   * @return The hash value of the instance
+   */
+  [[nodiscard]] virtual std::size_t do_hash() const
+  {
+    return std::hash<int>{}(static_cast<int>(aggregation::Kind::HOST_UDF));
+  }
+
+  /**
+   * @brief Compares two instances of the derived class for equality.
+   * @param other The other derived class's instance to compare with
+   * @return True if the two instances are equal
+   */
+  [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0;
+
+  /**
+   * @brief Clones the instance.
+   *
+   * A class derived from `host_udf_base` should not store too much data so that its instances
+   * remain lightweight for efficient cloning.
+   *
+   * @return A new instance cloned from this
+   */
+  [[nodiscard]] virtual std::unique_ptr<host_udf_base> clone() const = 0;
+};
+
+/** @} */  // end of group
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index db6d5255616..aacb5ccfede 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -33,11 +33,13 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <cuda/std/optional>
+#include <cuda/std/type_traits>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/pair.h>
 
 #include <algorithm>
+#include <type_traits>
 
 /**
  * @file column_device_view.cuh
@@ -56,8 +58,8 @@ namespace CUDF_EXPORT cudf {
  *
  */
 struct nullate {
-  struct YES : std::bool_constant<true> {};
-  struct NO : std::bool_constant<false> {};
+  struct YES : cuda::std::bool_constant<true> {};
+  struct NO : cuda::std::bool_constant<false> {};
   /**
    * @brief `nullate::DYNAMIC` defers the determination of nullability to run time rather than
    * compile time. The calling code is responsible for specifying whether or not nulls are
@@ -80,7 +82,7 @@ struct nullate {
      * @return `true` if nulls are expected in the operation in which this object is applied,
      * otherwise false
      */
-    constexpr operator bool() const noexcept { return value; }
+    CUDF_HOST_DEVICE constexpr operator bool() const noexcept { return value; }
     bool value;  ///< True if nulls are expected
   };
 };
@@ -319,14 +321,14 @@ class alignas(16) column_device_view_base {
   }
 
   template <typename C, typename T, typename = void>
-  struct has_element_accessor_impl : std::false_type {};
+  struct has_element_accessor_impl : cuda::std::false_type {};
 
   template <typename C, typename T>
   struct has_element_accessor_impl<
     C,
     T,
-    void_t<decltype(std::declval<C>().template element<T>(std::declval<size_type>()))>>
-    : std::true_type {};
+    void_t<decltype(cuda::std::declval<C>().template element<T>(cuda::std::declval<size_type>()))>>
+    : cuda::std::true_type {};
 };
 // @cond
 // Forward declaration
@@ -442,7 +444,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return string_view instance representing this element at this index
    */
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, string_view>)>
-  __device__ [[nodiscard]] T element(size_type element_index) const noexcept
+  [[nodiscard]] __device__ T element(size_type element_index) const noexcept
   {
     size_type index       = element_index + offset();  // account for this view's _offset
     char const* d_strings = static_cast<char const*>(_data);
@@ -501,7 +503,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return dictionary32 instance representing this element at this index
    */
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, dictionary32>)>
-  __device__ [[nodiscard]] T element(size_type element_index) const noexcept
+  [[nodiscard]] __device__ T element(size_type element_index) const noexcept
   {
     size_type index    = element_index + offset();  // account for this view's _offset
     auto const indices = d_children[0];
@@ -519,7 +521,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return numeric::fixed_point representing the element at this index
    */
   template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_point<T>())>
-  __device__ [[nodiscard]] T element(size_type element_index) const noexcept
+  [[nodiscard]] __device__ T element(size_type element_index) const noexcept
   {
     using namespace numeric;
     using rep        = typename T::rep;
@@ -534,7 +536,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return `true` if `column_device_view::element<T>()` has a valid overload, `false` otherwise
    */
   template <typename T>
-  static constexpr bool has_element_accessor()
+  CUDF_HOST_DEVICE static constexpr bool has_element_accessor()
   {
     return has_element_accessor_impl<column_device_view, T>::value;
   }
@@ -1032,7 +1034,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    * @return Reference to the element at the specified index
    */
   template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
-  __device__ [[nodiscard]] T& element(size_type element_index) const noexcept
+  [[nodiscard]] __device__ T& element(size_type element_index) const noexcept
   {
     return data<T>()[element_index];
   }
@@ -1044,7 +1046,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    * @return `true` if `mutable_column_device_view::element<T>()` has a valid overload, `false`
    */
   template <typename T>
-  static constexpr bool has_element_accessor()
+  CUDF_HOST_DEVICE static constexpr bool has_element_accessor()
   {
     return has_element_accessor_impl<mutable_column_device_view, T>::value;
   }
@@ -1425,13 +1427,13 @@ struct pair_rep_accessor {
 
  private:
   template <typename R, std::enable_if_t<std::is_same_v<R, rep_type>, void>* = nullptr>
-  __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const
+  [[nodiscard]] __device__ inline auto get_rep(cudf::size_type i) const
   {
     return col.element<R>(i);
   }
 
   template <typename R, std::enable_if_t<not std::is_same_v<R, rep_type>, void>* = nullptr>
-  __device__ [[nodiscard]] inline auto get_rep(cudf::size_type i) const
+  [[nodiscard]] __device__ inline auto get_rep(cudf::size_type i) const
   {
     return col.element<R>(i).value();
   }
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.cuh b/cpp/include/cudf/detail/aggregation/aggregation.cuh
index de53e7586cd..59011f7b138 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.cuh
+++ b/cpp/include/cudf/detail/aggregation/aggregation.cuh
@@ -23,6 +23,7 @@
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -31,12 +32,11 @@
 #include <thrust/fill.h>
 
 #include <type_traits>
-#include <vector>
 
 namespace cudf {
 namespace detail {
 template <typename T>
-constexpr bool is_product_supported()
+CUDF_HOST_DEVICE constexpr bool is_product_supported()
 {
   return is_numeric<T>();
 }
@@ -216,12 +216,12 @@ struct identity_initializer {
  * @throw cudf::logic_error if column type is not fixed-width
  *
  * @param table The table of columns to initialize.
- * @param aggs A vector of aggregation operations corresponding to the table
+ * @param aggs A span of aggregation operations corresponding to the table
  * columns. The aggregations determine the identity value for each column.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 void initialize_with_identity(mutable_table_view& table,
-                              std::vector<aggregation::Kind> const& aggs,
+                              host_span<cudf::aggregation::Kind const> aggs,
                               rmm::cuda_stream_view stream);
 
 }  // namespace detail
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 6661a461b8b..d873e93bd20 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -20,6 +20,7 @@
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <functional>
@@ -88,6 +89,8 @@ class simple_aggregations_collector {  // Declares the interface for the simple
                                                           class lead_lag_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class udf_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class host_udf_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class merge_lists_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
@@ -135,6 +138,7 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class collect_set_aggregation const& agg);
   virtual void visit(class lead_lag_aggregation const& agg);
   virtual void visit(class udf_aggregation const& agg);
+  virtual void visit(class host_udf_aggregation const& agg);
   virtual void visit(class merge_lists_aggregation const& agg);
   virtual void visit(class merge_sets_aggregation const& agg);
   virtual void visit(class merge_m2_aggregation const& agg);
@@ -960,6 +964,35 @@ class udf_aggregation final : public rolling_aggregation {
   }
 };
 
+/**
+ * @brief Derived class for specifying host-based UDF aggregation.
+ */
+class host_udf_aggregation final : public groupby_aggregation {
+ public:
+  std::unique_ptr<host_udf_base> udf_ptr;
+
+  host_udf_aggregation()                            = delete;
+  host_udf_aggregation(host_udf_aggregation const&) = delete;
+
+  // Need to define the constructor and destructor in a separate source file where we have the
+  // complete declaration of `host_udf_base`.
+  explicit host_udf_aggregation(std::unique_ptr<host_udf_base> udf_ptr_);
+  ~host_udf_aggregation() override;
+
+  [[nodiscard]] bool is_equal(aggregation const& _other) const override;
+
+  [[nodiscard]] size_t do_hash() const override;
+
+  [[nodiscard]] std::unique_ptr<aggregation> clone() const override;
+
+  std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+    data_type col_type, simple_aggregations_collector& collector) const override
+  {
+    return collector.visit(col_type, *this);
+  }
+  void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
 /**
  * @brief Derived aggregation class for specifying MERGE_LISTS aggregation
  */
@@ -1462,6 +1495,12 @@ struct target_type_impl<Source,
   using type = struct_view;
 };
 
+template <typename SourceType>
+struct target_type_impl<SourceType, aggregation::HOST_UDF> {
+  // Just a placeholder. The actual return type is unknown.
+  using type = struct_view;
+};
+
 /**
  * @brief Helper alias to get the accumulator type for performing aggregation
  * `k` on elements of type `Source`
@@ -1579,6 +1618,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind
       return f.template operator()<aggregation::MERGE_TDIGEST>(std::forward<Ts>(args)...);
     case aggregation::EWMA:
       return f.template operator()<aggregation::EWMA>(std::forward<Ts>(args)...);
+    case aggregation::HOST_UDF:
+      return f.template operator()<aggregation::HOST_UDF>(std::forward<Ts>(args)...);
     default: {
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("Unsupported aggregation.");
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index 4159e324472..9226697a7f6 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -16,300 +16,25 @@
 
 #pragma once
 
-#include <cudf/column/column_device_view.cuh>
-#include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
-#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/detail/utilities/cuda.hpp>
-#include <cudf/null_mask.hpp>
-#include <cudf/strings/string_view.cuh>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
-#include <cudf/utilities/traits.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <cub/cub.cuh>
-#include <cuda/atomic>
 #include <thrust/copy.h>
 #include <thrust/iterator/counting_iterator.h>
 
-#include <algorithm>
-
 namespace cudf {
 namespace detail {
 
-// Compute the count of elements that pass the mask within each block
-template <typename Filter, int block_size>
-CUDF_KERNEL void compute_block_counts(cudf::size_type* __restrict__ block_counts,
-                                      cudf::size_type size,
-                                      cudf::size_type per_thread,
-                                      Filter filter)
-{
-  int tid   = threadIdx.x + per_thread * block_size * blockIdx.x;
-  int count = 0;
-
-  for (int i = 0; i < per_thread; i++) {
-    bool mask_true = (tid < size) && filter(tid);
-    count += __syncthreads_count(mask_true);
-    tid += block_size;
-  }
-
-  if (threadIdx.x == 0) block_counts[blockIdx.x] = count;
-}
-
-// Compute the exclusive prefix sum of each thread's mask value within each block
-template <int block_size>
-__device__ cudf::size_type block_scan_mask(bool mask_true, cudf::size_type& block_sum)
-{
-  int offset = 0;
-
-  using BlockScan = cub::BlockScan<cudf::size_type, block_size>;
-  __shared__ typename BlockScan::TempStorage temp_storage;
-  BlockScan(temp_storage).ExclusiveSum(mask_true, offset, block_sum);
-
-  return offset;
-}
-
-// This kernel scatters data and validity mask of a column based on the
-// scan of the boolean mask. The block offsets for the scan are already computed.
-// Just compute the scan of the mask in each block and add it to the block's
-// output offset. This is the output index of each element. Scattering
-// the valid mask is not as easy, because each thread is only responsible for
-// one bit. Warp-level processing (ballot) makes this simpler.
-// To make scattering efficient, we "coalesce" the block's scattered data and
-// valids in shared memory, and then write from shared memory to global memory
-// in a contiguous manner.
-// The has_validity template parameter specializes this kernel for the
-// non-nullable case for performance without writing another kernel.
-//
-// Note: `filter` is not run on indices larger than the input column size
-template <typename T, typename Filter, int block_size, bool has_validity>
-__launch_bounds__(block_size) CUDF_KERNEL
-  void scatter_kernel(cudf::mutable_column_device_view output_view,
-                      cudf::size_type* output_null_count,
-                      cudf::column_device_view input_view,
-                      cudf::size_type const* __restrict__ block_offsets,
-                      cudf::size_type size,
-                      cudf::size_type per_thread,
-                      Filter filter)
-{
-  T* __restrict__ output_data                   = output_view.data<T>();
-  cudf::bitmask_type* __restrict__ output_valid = output_view.null_mask();
-  static_assert(block_size <= 1024, "Maximum thread block size exceeded");
-
-  int tid                      = threadIdx.x + per_thread * block_size * blockIdx.x;
-  cudf::size_type block_offset = block_offsets[blockIdx.x];
-
-  // one extra warp worth in case the block is not aligned
-  __shared__ bool temp_valids[has_validity ? block_size + cudf::detail::warp_size : 1];
-  __shared__ T temp_data[block_size];
-
-  cudf::size_type warp_valid_counts{0};  // total valid sum over the `per_thread` loop below
-  cudf::size_type block_sum = 0;         // count passing filter over the `per_thread` loop below
-
-  // Note that since the maximum gridDim.x on all supported GPUs is as big as
-  // cudf::size_type, this loop is sufficient to cover our maximum column size
-  // regardless of the value of block_size and per_thread.
-  for (int i = 0; i < per_thread; i++) {
-    bool mask_true = (tid < size) && filter(tid);
-
-    cudf::size_type tmp_block_sum = 0;
-    // get output location using a scan of the mask result
-    cudf::size_type const local_index = block_scan_mask<block_size>(mask_true, tmp_block_sum);
-    block_sum += tmp_block_sum;
-
-    if (has_validity) {
-      temp_valids[threadIdx.x] = false;  // init shared memory
-      if (threadIdx.x < cudf::detail::warp_size) temp_valids[block_size + threadIdx.x] = false;
-      __syncthreads();  // wait for init
-    }
-
-    if (mask_true) {
-      temp_data[local_index] = input_view.data<T>()[tid];  // scatter data to shared
-
-      // scatter validity mask to shared memory
-      if (has_validity and input_view.is_valid(tid)) {
-        // determine aligned offset for this warp's output
-        cudf::size_type const aligned_offset      = block_offset % cudf::detail::warp_size;
-        temp_valids[local_index + aligned_offset] = true;
-      }
-    }
-
-    __syncthreads();  // wait for shared data and validity mask to be complete
-
-    // Copy output data coalesced from shared to global
-    if (threadIdx.x < tmp_block_sum)
-      output_data[block_offset + threadIdx.x] = temp_data[threadIdx.x];
-
-    if (has_validity) {
-      // Since the valid bools are contiguous in shared memory now, we can use
-      // __popc to combine them into a single mask element.
-      // Then, most mask elements can be directly copied from shared to global
-      // memory. Only the first and last 32-bit mask elements of each block must
-      // use an atomicOr, because these are where other blocks may overlap.
-
-      constexpr int num_warps = block_size / cudf::detail::warp_size;
-      // account for partial blocks with non-warp-aligned offsets
-      int const last_index = tmp_block_sum + (block_offset % cudf::detail::warp_size) - 1;
-      int const last_warp  = min(num_warps, last_index / cudf::detail::warp_size);
-      int const wid        = threadIdx.x / cudf::detail::warp_size;
-      int const lane       = threadIdx.x % cudf::detail::warp_size;
-
-      cudf::size_type tmp_warp_valid_counts{0};
-
-      if (tmp_block_sum > 0 && wid <= last_warp) {
-        int valid_index = (block_offset / cudf::detail::warp_size) + wid;
-
-        // compute the valid mask for this warp
-        uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[threadIdx.x]);
-
-        // Note the atomicOr's below assume that output_valid has been set to
-        // all zero before the kernel
-        if (lane == 0 && valid_warp != 0) {
-          tmp_warp_valid_counts = __popc(valid_warp);
-          if (wid > 0 && wid < last_warp)
-            output_valid[valid_index] = valid_warp;
-          else {
-            cuda::atomic_ref<cudf::bitmask_type, cuda::thread_scope_device> ref{
-              output_valid[valid_index]};
-            ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed);
-          }
-        }
-
-        // if the block is full and not aligned then we have one more warp to cover
-        if ((wid == 0) && (last_warp == num_warps)) {
-          uint32_t valid_warp = __ballot_sync(0xffff'ffffu, temp_valids[block_size + threadIdx.x]);
-          if (lane == 0 && valid_warp != 0) {
-            tmp_warp_valid_counts += __popc(valid_warp);
-            cuda::atomic_ref<cudf::bitmask_type, cuda::thread_scope_device> ref{
-              output_valid[valid_index + num_warps]};
-            ref.fetch_or(valid_warp, cuda::std::memory_order_relaxed);
-          }
-        }
-      }
-      warp_valid_counts += tmp_warp_valid_counts;
-    }
-
-    block_offset += tmp_block_sum;
-    tid += block_size;
-  }
-  // Compute total null_count for this block and add it to global count
-  constexpr cudf::size_type leader_lane{0};
-  cudf::size_type block_valid_count =
-    cudf::detail::single_lane_block_sum_reduce<block_size, leader_lane>(warp_valid_counts);
-
-  if (threadIdx.x == 0) {  // one thread computes and adds to null count
-    cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{*output_null_count};
-    ref.fetch_add(block_sum - block_valid_count, cuda::std::memory_order_relaxed);
-  }
-}
-
-template <typename T, typename Enable = void>
-struct DeviceType {
-  using type = T;
-};
-
-template <typename T>
-struct DeviceType<T, std::enable_if_t<cudf::is_timestamp<T>()>> {
-  using type = typename T::rep;
-};
-
-template <typename T>
-struct DeviceType<T, std::enable_if_t<cudf::is_fixed_point<T>()>> {
-  using type = typename cudf::device_storage_type_t<T>;
-};
-
-// Dispatch functor which performs the scatter for fixed column types and gather for other
-template <typename Filter, int block_size>
-struct scatter_gather_functor {
-  template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
-  std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
-                                           cudf::size_type const& output_size,
-                                           cudf::size_type const* block_offsets,
-                                           Filter filter,
-                                           cudf::size_type per_thread,
-                                           rmm::cuda_stream_view stream,
-                                           rmm::device_async_resource_ref mr)
-  {
-    auto output_column =
-      cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
-    auto output = output_column->mutable_view();
-
-    bool has_valid = input.nullable();
-
-    using Type = typename DeviceType<T>::type;
-
-    auto scatter = (has_valid) ? scatter_kernel<Type, Filter, block_size, true>
-                               : scatter_kernel<Type, Filter, block_size, false>;
-
-    cudf::detail::grid_1d grid{input.size(), block_size, per_thread};
-
-    cudf::detail::device_scalar<cudf::size_type> null_count{0, stream};
-    if (output.nullable()) {
-      // Have to initialize the output mask to all zeros because we may update
-      // it with atomicOr().
-      CUDF_CUDA_TRY(cudaMemsetAsync(static_cast<void*>(output.null_mask()),
-                                    0,
-                                    cudf::bitmask_allocation_size_bytes(output.size()),
-                                    stream.value()));
-    }
-
-    auto output_device_view = cudf::mutable_column_device_view::create(output, stream);
-    auto input_device_view  = cudf::column_device_view::create(input, stream);
-    scatter<<<grid.num_blocks, block_size, 0, stream.value()>>>(*output_device_view,
-                                                                null_count.data(),
-                                                                *input_device_view,
-                                                                block_offsets,
-                                                                input.size(),
-                                                                per_thread,
-                                                                filter);
-
-    if (has_valid) { output_column->set_null_count(null_count.value(stream)); }
-    return output_column;
-  }
-
-  template <typename T,
-            std::enable_if_t<!cudf::is_fixed_width<T>() and !cudf::is_fixed_point<T>()>* = nullptr>
-  std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
-                                           cudf::size_type const& output_size,
-                                           cudf::size_type const*,
-                                           Filter filter,
-                                           cudf::size_type,
-                                           rmm::cuda_stream_view stream,
-                                           rmm::device_async_resource_ref mr)
-  {
-    rmm::device_uvector<cudf::size_type> indices(output_size, stream);
-
-    thrust::copy_if(rmm::exec_policy(stream),
-                    thrust::counting_iterator<cudf::size_type>(0),
-                    thrust::counting_iterator<cudf::size_type>(input.size()),
-                    indices.begin(),
-                    filter);
-
-    auto output_table = cudf::detail::gather(cudf::table_view{{input}},
-                                             indices,
-                                             cudf::out_of_bounds_policy::DONT_CHECK,
-                                             cudf::detail::negative_index_policy::NOT_ALLOWED,
-                                             stream,
-                                             mr);
-
-    // There will be only one column
-    return std::make_unique<cudf::column>(std::move(output_table->get_column(0)));
-  }
-};
-
 /**
  * @brief Filters `input` using a Filter function object
  *
@@ -319,9 +44,11 @@ struct scatter_gather_functor {
  * false otherwise.
  *
  * @tparam Filter the filter functor type
- * @param[in] input The table_view to filter
- * @param[in] filter A function object that takes an index and returns a bool
- * @return unique_ptr<table> The table generated from filtered `input`.
+ * @param input The table_view to filter
+ * @param filter A function object that takes an index and returns a bool
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used for allocating the returned memory
+ * @return The table generated from filtered `input`
  */
 template <typename Filter>
 std::unique_ptr<table> copy_if(table_view const& input,
@@ -333,76 +60,22 @@ std::unique_ptr<table> copy_if(table_view const& input,
 
   if (0 == input.num_rows() || 0 == input.num_columns()) { return empty_like(input); }
 
-  constexpr int block_size = 256;
-  cudf::size_type per_thread =
-    elements_per_thread(compute_block_counts<Filter, block_size>, input.num_rows(), block_size);
-  cudf::detail::grid_1d grid{input.num_rows(), block_size, per_thread};
-
-  // temp storage for block counts and offsets
-  rmm::device_uvector<cudf::size_type> block_counts(grid.num_blocks, stream);
-  rmm::device_uvector<cudf::size_type> block_offsets(grid.num_blocks + 1, stream);
-
-  // 1. Find the count of elements in each block that "pass" the mask
-  compute_block_counts<Filter, block_size><<<grid.num_blocks, block_size, 0, stream.value()>>>(
-    block_counts.begin(), input.num_rows(), per_thread, filter);
-
-  // initialize just the first element of block_offsets to 0 since the InclusiveSum below
-  // starts at the second element.
-  CUDF_CUDA_TRY(cudaMemsetAsync(block_offsets.begin(), 0, sizeof(cudf::size_type), stream.value()));
-
-  // 2. Find the offset for each block's output using a scan of block counts
-  if (grid.num_blocks > 1) {
-    // Determine and allocate temporary device storage
-    size_t temp_storage_bytes = 0;
-    cub::DeviceScan::InclusiveSum(nullptr,
-                                  temp_storage_bytes,
-                                  block_counts.begin(),
-                                  block_offsets.begin() + 1,
-                                  grid.num_blocks,
-                                  stream.value());
-    rmm::device_buffer d_temp_storage(temp_storage_bytes, stream);
-
-    // Run exclusive prefix sum
-    cub::DeviceScan::InclusiveSum(d_temp_storage.data(),
-                                  temp_storage_bytes,
-                                  block_counts.begin(),
-                                  block_offsets.begin() + 1,
-                                  grid.num_blocks,
-                                  stream.value());
-  }
-
-  // As it is InclusiveSum, last value in block_offsets will be output_size
-  // unless num_blocks == 1, in which case output_size is just block_counts[0]
-  cudf::size_type output_size{0};
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    &output_size,
-    grid.num_blocks > 1 ? block_offsets.begin() + grid.num_blocks : block_counts.begin(),
-    sizeof(cudf::size_type),
-    cudaMemcpyDefault,
-    stream.value()));
+  auto indices     = rmm::device_uvector<size_type>(input.num_rows(), stream);
+  auto const begin = thrust::counting_iterator<size_type>(0);
+  auto const end   = begin + input.num_rows();
+  auto const indices_end =
+    thrust::copy_if(rmm::exec_policy(stream), begin, end, indices.begin(), filter);
 
-  stream.synchronize();
+  auto const output_size = static_cast<size_type>(thrust::distance(indices.begin(), indices_end));
 
-  if (output_size == input.num_rows()) {
-    return std::make_unique<table>(input, stream, mr);
-  } else if (output_size > 0) {
-    std::vector<std::unique_ptr<column>> out_columns(input.num_columns());
-    std::transform(input.begin(), input.end(), out_columns.begin(), [&](auto col_view) {
-      return cudf::type_dispatcher(col_view.type(),
-                                   scatter_gather_functor<Filter, block_size>{},
-                                   col_view,
-                                   output_size,
-                                   block_offsets.begin(),
-                                   filter,
-                                   per_thread,
-                                   stream,
-                                   mr);
-    });
+  // nothing selected
+  if (output_size == 0) { return empty_like(input); }
+  // everything selected
+  if (output_size == input.num_rows()) { return std::make_unique<table>(input, stream, mr); }
 
-    return std::make_unique<table>(std::move(out_columns));
-  } else {
-    return empty_like(input);
-  }
+  auto const map = device_span<size_type const>(indices.data(), output_size);
+  return cudf::detail::gather(
+    input, map, out_of_bounds_policy::DONT_CHECK, negative_index_policy::NOT_ALLOWED, stream, mr);
 }
 
 }  // namespace detail
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index 5dc75b1a3fb..a7efb4e6e93 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -44,10 +44,11 @@ __launch_bounds__(block_size) CUDF_KERNEL
                            mutable_column_device_view out,
                            size_type* __restrict__ const valid_count)
 {
-  auto tidx                      = cudf::detail::grid_1d::global_thread_id<block_size>();
-  auto const stride              = cudf::detail::grid_1d::grid_stride<block_size>();
-  int const warp_id              = tidx / cudf::detail::warp_size;
-  size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size;
+  auto tidx = cudf::detail::grid_1d::global_thread_id<block_size>();
+
+  auto const stride         = cudf::detail::grid_1d::grid_stride<block_size>();
+  auto const warp_id        = tidx / cudf::detail::warp_size;
+  auto const warps_per_grid = stride / cudf::detail::warp_size;
 
   // begin/end indices for the column data
   size_type const begin = 0;
@@ -60,7 +61,7 @@ __launch_bounds__(block_size) CUDF_KERNEL
 
   // lane id within the current warp
   constexpr size_type leader_lane{0};
-  int const lane_id = threadIdx.x % cudf::detail::warp_size;
+  auto const lane_id = threadIdx.x % cudf::detail::warp_size;
 
   size_type warp_valid_count{0};
 
diff --git a/cpp/include/cudf/detail/device_scalar.hpp b/cpp/include/cudf/detail/device_scalar.hpp
index 16ca06c6561..090dc8b62b6 100644
--- a/cpp/include/cudf/detail/device_scalar.hpp
+++ b/cpp/include/cudf/detail/device_scalar.hpp
@@ -78,7 +78,7 @@ class device_scalar : public rmm::device_scalar<T> {
   [[nodiscard]] T value(rmm::cuda_stream_view stream) const
   {
     cuda_memcpy<T>(bounce_buffer, device_span<T const>{this->data(), 1}, stream);
-    return bounce_buffer[0];
+    return std::move(bounce_buffer[0]);
   }
 
   void set_value_async(T const& value, rmm::cuda_stream_view stream)
diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh
index 2acc10105cf..9a10163eb15 100644
--- a/cpp/include/cudf/detail/distinct_hash_join.cuh
+++ b/cpp/include/cudf/detail/distinct_hash_join.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,19 +36,24 @@ using cudf::experimental::row::lhs_index_type;
 using cudf::experimental::row::rhs_index_type;
 
 /**
- * @brief An comparator adapter wrapping both self comparator and two table comparator
+ * @brief A custom comparator used for the build table insertion
  */
-template <typename Equal>
-struct comparator_adapter {
-  comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {}
-
-  __device__ constexpr auto operator()(
+struct always_not_equal {
+  __device__ constexpr bool operator()(
     cuco::pair<hash_value_type, rhs_index_type> const&,
     cuco::pair<hash_value_type, rhs_index_type> const&) const noexcept
   {
     // All build table keys are distinct thus `false` no matter what
     return false;
   }
+};
+
+/**
+ * @brief A comparator adapter wrapping the two-table comparator
+ */
+template <typename Equal>
+struct comparator_adapter {
+  comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {}
 
   __device__ constexpr auto operator()(
     cuco::pair<hash_value_type, lhs_index_type> const& lhs,
@@ -62,56 +67,14 @@ struct comparator_adapter {
   Equal _d_equal;
 };
 
-template <typename Hasher>
-struct hasher_adapter {
-  hasher_adapter(Hasher const& d_hasher = {}) : _d_hasher{d_hasher} {}
-
-  template <typename T>
-  __device__ constexpr auto operator()(cuco::pair<hash_value_type, T> const& key) const noexcept
-  {
-    return _d_hasher(key.first);
-  }
-
- private:
-  Hasher _d_hasher;
-};
-
 /**
  * @brief Distinct hash join that builds hash table in creation and probes results in subsequent
  * `*_join` member functions.
  *
- * @tparam HasNested Flag indicating whether there are nested columns in build/probe table
+ * This class enables the distinct hash join scheme that builds the hash table once and probes it
+ * as many times as needed (possibly in parallel).
  */
-template <cudf::has_nested HasNested>
-struct distinct_hash_join {
- private:
-  /// Device row equal type
-  using d_equal_type = cudf::experimental::row::equality::strong_index_comparator_adapter<
-    cudf::experimental::row::equality::device_row_comparator<HasNested == cudf::has_nested::YES,
-                                                             cudf::nullate::DYNAMIC>>;
-  using hasher              = hasher_adapter<thrust::identity<hash_value_type>>;
-  using probing_scheme_type = cuco::linear_probing<1, hasher>;
-  using cuco_storage_type   = cuco::storage<1>;
-
-  /// Hash table type
-  using hash_table_type = cuco::static_set<cuco::pair<hash_value_type, rhs_index_type>,
-                                           cuco::extent<size_type>,
-                                           cuda::thread_scope_device,
-                                           comparator_adapter<d_equal_type>,
-                                           probing_scheme_type,
-                                           cudf::detail::cuco_allocator<char>,
-                                           cuco_storage_type>;
-
-  bool _has_nulls;  ///< true if nulls are present in either build table or probe table
-  cudf::null_equality _nulls_equal;  ///< whether to consider nulls as equal
-  cudf::table_view _build;           ///< input table to build the hash map
-  cudf::table_view _probe;           ///< input table to probe the hash map
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table>
-    _preprocessed_build;  ///< input table preprocssed for row operators
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table>
-    _preprocessed_probe;        ///< input table preprocssed for row operators
-  hash_table_type _hash_table;  ///< hash table built on `_build`
-
+class distinct_hash_join {
  public:
   distinct_hash_join()                                     = delete;
   ~distinct_hash_join()                                    = default;
@@ -120,21 +83,28 @@ struct distinct_hash_join {
   distinct_hash_join& operator=(distinct_hash_join const&) = delete;
   distinct_hash_join& operator=(distinct_hash_join&&)      = delete;
 
+  /**
+   * @brief Hasher adapter used by distinct hash join
+   */
+  struct hasher {
+    template <typename T>
+    __device__ constexpr hash_value_type operator()(
+      cuco::pair<hash_value_type, T> const& key) const noexcept
+    {
+      return key.first;
+    }
+  };
+
   /**
    * @brief Constructor that internally builds the hash table based on the given `build` table.
    *
    * @throw cudf::logic_error if the number of columns in `build` table is 0.
    *
    * @param build The build table, from which the hash table is built
-   * @param probe The probe table
-   * @param has_nulls Flag to indicate if any nulls exist in the `build` table or
-   *        any `probe` table that will be used later for join.
    * @param compare_nulls Controls whether null join-key values should match or not.
    * @param stream CUDA stream used for device memory operations and kernel launches.
    */
   distinct_hash_join(cudf::table_view const& build,
-                     cudf::table_view const& probe,
-                     bool has_nulls,
                      cudf::null_equality compare_nulls,
                      rmm::cuda_stream_view stream);
 
@@ -143,12 +113,36 @@ struct distinct_hash_join {
    */
   std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
             std::unique_ptr<rmm::device_uvector<size_type>>>
-  inner_join(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const;
+  inner_join(cudf::table_view const& probe,
+             rmm::cuda_stream_view stream,
+             rmm::device_async_resource_ref mr) const;
 
   /**
    * @copydoc cudf::distinct_hash_join::left_join
    */
   std::unique_ptr<rmm::device_uvector<size_type>> left_join(
-    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const;
+    cudf::table_view const& probe,
+    rmm::cuda_stream_view stream,
+    rmm::device_async_resource_ref mr) const;
+
+ private:
+  using probing_scheme_type = cuco::linear_probing<1, hasher>;
+  using cuco_storage_type   = cuco::storage<1>;
+
+  /// Hash table type
+  using hash_table_type = cuco::static_set<cuco::pair<hash_value_type, rhs_index_type>,
+                                           cuco::extent<size_type>,
+                                           cuda::thread_scope_device,
+                                           always_not_equal,
+                                           probing_scheme_type,
+                                           cudf::detail::cuco_allocator<char>,
+                                           cuco_storage_type>;
+
+  bool _has_nested_columns;  ///< True if nested columns are present in build and probe tables
+  cudf::null_equality _nulls_equal;  ///< Whether to consider nulls as equal
+  cudf::table_view _build;           ///< Input table to build the hash map
+  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table>
+    _preprocessed_build;        ///< Input table preprocessed for row operators
+  hash_table_type _hash_table;  ///< Hash table built on `_build`
 };
 }  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/get_value.cuh b/cpp/include/cudf/detail/get_value.cuh
index 5ea0d06039f..1bfb40e5916 100644
--- a/cpp/include/cudf/detail/get_value.cuh
+++ b/cpp/include/cudf/detail/get_value.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
@@ -48,11 +49,9 @@ T get_value(column_view const& col_view, size_type element_index, rmm::cuda_stre
   CUDF_EXPECTS(data_type(type_to_id<T>()) == col_view.type(), "get_value data type mismatch");
   CUDF_EXPECTS(element_index >= 0 && element_index < col_view.size(),
                "invalid element_index value");
-  T result;
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    &result, col_view.data<T>() + element_index, sizeof(T), cudaMemcpyDefault, stream.value()));
-  stream.synchronize();
-  return result;
+  return cudf::detail::make_host_vector_sync(
+           device_span<T const>{col_view.data<T>() + element_index, 1}, stream)
+    .front();
 }
 
 }  // namespace detail
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index 61a8e9f7ec3..72cdc3d8067 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -74,9 +74,10 @@ class grid_1d {
    * @param num_threads_per_block The number of threads per block
    * @return thread_index_type The global thread index
    */
-  static constexpr thread_index_type global_thread_id(thread_index_type thread_id,
-                                                      thread_index_type block_id,
-                                                      thread_index_type num_threads_per_block)
+  __device__ static constexpr thread_index_type global_thread_id(
+    thread_index_type thread_id,
+    thread_index_type block_id,
+    thread_index_type num_threads_per_block)
   {
     return thread_id + block_id * num_threads_per_block;
   }
@@ -114,8 +115,8 @@ class grid_1d {
    * @param num_threads_per_block The number of threads per block
    * @return thread_index_type The global thread index
    */
-  static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block,
-                                                 thread_index_type num_blocks_per_grid)
+  __device__ static constexpr thread_index_type grid_stride(thread_index_type num_threads_per_block,
+                                                            thread_index_type num_blocks_per_grid)
   {
     return num_threads_per_block * num_blocks_per_grid;
   }
diff --git a/cpp/include/cudf/detail/utilities/device_operators.cuh b/cpp/include/cudf/detail/utilities/device_operators.cuh
index 46f424e051b..923cd04479d 100644
--- a/cpp/include/cudf/detail/utilities/device_operators.cuh
+++ b/cpp/include/cudf/detail/utilities/device_operators.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
 
+#include <cuda/std/functional>
+
 #include <type_traits>
 
 namespace cudf {
@@ -42,7 +44,7 @@ template <typename LHS,
           std::enable_if_t<cudf::is_relationally_comparable<LHS, RHS>()>* = nullptr>
 CUDF_HOST_DEVICE inline auto min(LHS const& lhs, RHS const& rhs)
 {
-  return std::min(lhs, rhs);
+  return cuda::std::min(lhs, rhs);
 }
 
 /**
@@ -53,7 +55,7 @@ template <typename LHS,
           std::enable_if_t<cudf::is_relationally_comparable<LHS, RHS>()>* = nullptr>
 CUDF_HOST_DEVICE inline auto max(LHS const& lhs, RHS const& rhs)
 {
-  return std::max(lhs, rhs);
+  return cuda::std::max(lhs, rhs);
 }
 }  // namespace detail
 
@@ -68,22 +70,26 @@ struct DeviceSum {
   }
 
   template <typename T, std::enable_if_t<cudf::is_timestamp<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return T{typename T::duration{0}};
   }
 
   template <typename T,
             std::enable_if_t<!cudf::is_timestamp<T>() && !cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return T{0};
   }
 
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
+#ifndef __CUDA_ARCH__
     CUDF_FAIL("fixed_point does not yet support device operator identity");
+#else
+    CUDF_UNREACHABLE("fixed_point does not yet support device operator identity");
+#endif
     return T{};
   }
 };
@@ -105,7 +111,7 @@ struct DeviceCount {
   }
 
   template <typename T>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return T{};
   }
@@ -125,7 +131,7 @@ struct DeviceMin {
   template <typename T,
             std::enable_if_t<!std::is_same_v<T, cudf::string_view> && !cudf::is_dictionary<T>() &&
                              !cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     // chrono types do not have std::numeric_limits specializations and should use T::max()
     // https://eel.is/c++draft/numeric.limits.general#6
@@ -139,9 +145,13 @@ struct DeviceMin {
   }
 
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
+#ifndef __CUDA_ARCH__
     CUDF_FAIL("fixed_point does not yet support DeviceMin identity");
+#else
+    CUDF_UNREACHABLE("fixed_point does not yet support DeviceMin identity");
+#endif
     return cuda::std::numeric_limits<T>::max();
   }
 
@@ -153,7 +163,7 @@ struct DeviceMin {
   }
 
   template <typename T, std::enable_if_t<cudf::is_dictionary<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return static_cast<T>(T::max_value());
   }
@@ -173,7 +183,7 @@ struct DeviceMax {
   template <typename T,
             std::enable_if_t<!std::is_same_v<T, cudf::string_view> && !cudf::is_dictionary<T>() &&
                              !cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     // chrono types do not have std::numeric_limits specializations and should use T::min()
     // https://eel.is/c++draft/numeric.limits.general#6
@@ -187,9 +197,13 @@ struct DeviceMax {
   }
 
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
+#ifndef __CUDA_ARCH__
     CUDF_FAIL("fixed_point does not yet support DeviceMax identity");
+#else
+    CUDF_UNREACHABLE("fixed_point does not yet support DeviceMax identity");
+#endif
     return cuda::std::numeric_limits<T>::lowest();
   }
 
@@ -200,7 +214,7 @@ struct DeviceMax {
   }
 
   template <typename T, std::enable_if_t<cudf::is_dictionary<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return static_cast<T>(T::lowest_value());
   }
@@ -217,15 +231,19 @@ struct DeviceProduct {
   }
 
   template <typename T, std::enable_if_t<!cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
     return T{1};
   }
 
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-  static constexpr T identity()
+  CUDF_HOST_DEVICE static constexpr T identity()
   {
+#ifndef __CUDA_ARCH__
     CUDF_FAIL("fixed_point does not yet support DeviceProduct identity");
+#else
+    CUDF_UNREACHABLE("fixed_point does not yet support DeviceProduct identity");
+#endif
     return T{1, numeric::scale_type{0}};
   }
 };
diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp
index 8b709f2a8f8..2e3d71815c0 100644
--- a/cpp/include/cudf/detail/utilities/integer_utils.hpp
+++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp
@@ -1,7 +1,7 @@
 /*
  * Copyright 2019 BlazingDB, Inc.
  *     Copyright 2019 Eyal Rozenberg <eyalroz@blazingdb.com>
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -86,7 +86,7 @@ constexpr S round_down_safe(S number_to_round, S modulus) noexcept
  * `modulus` is positive and does not check for overflow.
  */
 template <typename S>
-constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept
+CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexcept
 {
   auto remainder = number_to_round % modulus;
   if (remainder == 0) { return number_to_round; }
@@ -134,16 +134,20 @@ constexpr I div_rounding_up_safe(std::integral_constant<bool, true>, I dividend,
 }  // namespace detail
 
 /**
- * Divides the left-hand-side by the right-hand-side, rounding up
+ * @brief Divides the left-hand-side by the right-hand-side, rounding up
  * to an integral multiple of the right-hand-side, e.g. (9,5) -> 2 , (10,5) -> 2, (11,5) -> 3.
  *
- * @param dividend the number to divide
- * @param divisor the number of by which to divide
- * @return The least integer multiple of {@link divisor} which is greater than or equal to
- * the non-integral division dividend/divisor.
+ * The result is undefined if `divisor == 0` or
+ * if `divisor == -1` and `dividend == min<I>()`.
+ *
+ * Will not overflow, and may _or may not_ be slower than the intuitive
+ * approach of using `(dividend + divisor - 1) / divisor`.
  *
- * @note will not overflow, and may _or may not_ be slower than the intuitive
- * approach of using (dividend + divisor - 1) / divisor
+ * @tparam I Integer type for `dividend`, `divisor`, and the return type
+ * @param dividend The number to divide
+ * @param divisor The number by which to divide
+ * @return The least integer which is greater than or equal to
+ * the non-integral division `dividend/divisor`
  */
 template <typename I>
 constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept
@@ -183,7 +187,7 @@ constexpr bool is_a_power_of_two(I val) noexcept
  * @return Absolute value if value type is signed.
  */
 template <typename T>
-constexpr auto absolute_value(T value) -> T
+CUDF_HOST_DEVICE constexpr auto absolute_value(T value) -> T
 {
   if constexpr (cuda::std::is_signed<T>()) return numeric::detail::abs(value);
   return value;
diff --git a/cpp/include/cudf/detail/utilities/logger.hpp b/cpp/include/cudf/detail/utilities/logger.hpp
deleted file mode 100644
index e7643eb44bd..00000000000
--- a/cpp/include/cudf/detail/utilities/logger.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cudf/utilities/logger.hpp>
-
-// Log messages that require computation should only be used at level TRACE and DEBUG
-#define CUDF_LOG_TRACE(...)    SPDLOG_LOGGER_TRACE(&cudf::detail::logger(), __VA_ARGS__)
-#define CUDF_LOG_DEBUG(...)    SPDLOG_LOGGER_DEBUG(&cudf::detail::logger(), __VA_ARGS__)
-#define CUDF_LOG_INFO(...)     SPDLOG_LOGGER_INFO(&cudf::detail::logger(), __VA_ARGS__)
-#define CUDF_LOG_WARN(...)     SPDLOG_LOGGER_WARN(&cudf::detail::logger(), __VA_ARGS__)
-#define CUDF_LOG_ERROR(...)    SPDLOG_LOGGER_ERROR(&cudf::detail::logger(), __VA_ARGS__)
-#define CUDF_LOG_CRITICAL(...) SPDLOG_LOGGER_CRITICAL(&cudf::detail::logger(), __VA_ARGS__)
diff --git a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
index fce08b4a5c4..9e68bafb09a 100644
--- a/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
+++ b/cpp/include/cudf/fixed_point/detail/floating_conversion.hpp
@@ -22,6 +22,7 @@
 #include <cuda/std/cmath>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
+#include <cuda/std/utility>
 
 #include <cstring>
 
@@ -183,7 +184,7 @@ struct floating_converter {
    * @param integer_rep The bit-casted floating value to extract the exponent from
    * @return The stored base-2 exponent and significand, shifted for denormals
    */
-  CUDF_HOST_DEVICE inline static std::pair<IntegralType, int> get_significand_and_pow2(
+  CUDF_HOST_DEVICE inline static cuda::std::pair<IntegralType, int> get_significand_and_pow2(
     IntegralType integer_rep)
   {
     // Extract the significand
@@ -1008,7 +1009,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int
   }
 
   // Our shifting_rep is now the integer mantissa, return it and the powers of 2
-  return std::pair{shifting_rep, pow2};
+  return cuda::std::pair{shifting_rep, pow2};
 }
 
 /**
@@ -1075,7 +1076,7 @@ CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int
   }
 
   // Our shifting_rep is now the integer mantissa, return it and the powers of 2
-  return std::pair{shifting_rep, pow2};
+  return cuda::std::pair{shifting_rep, pow2};
 }
 
 /**
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index ea2f5d4b6ca..5edbb322231 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -60,7 +60,7 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 };
  * @return `true` if the type is supported by `fixed_point` implementation
  */
 template <typename T>
-constexpr inline auto is_supported_representation_type()
+CUDF_HOST_DEVICE constexpr inline auto is_supported_representation_type()
 {
   return cuda::std::is_same_v<T, int32_t> ||  //
          cuda::std::is_same_v<T, int64_t> ||  //
@@ -72,6 +72,24 @@ constexpr inline auto is_supported_representation_type()
 // Helper functions for `fixed_point` type
 namespace detail {
 
+/**
+ * @brief Returns the smaller of the given scales
+ *
+ * @param a The left-hand side value to compare
+ * @param b The right-hand side value to compare
+ * @return The smaller of the given scales
+ */
+CUDF_HOST_DEVICE constexpr inline scale_type min(scale_type const& a, scale_type const& b)
+{
+  // TODO This is a temporary workaround because <cuda/std/functional> is not self-contained when
+  // built with NVRTC 11.8. Replace this with cuda::std::min once the underlying issue is resolved.
+#ifdef __CUDA_ARCH__
+  return scale_type{min(static_cast<int>(a), static_cast<int>(b))};
+#else
+  return std::min(a, b);
+#endif
+}
+
 /**
  * @brief A function for integer exponentiation by squaring.
  *
@@ -267,12 +285,12 @@ class fixed_point {
    * @return The `fixed_point` number in base 10 (aka human readable format)
    */
   template <typename U, typename cuda::std::enable_if_t<cuda::std::is_integral_v<U>>* = nullptr>
-  explicit constexpr operator U() const
+  CUDF_HOST_DEVICE explicit constexpr operator U() const
   {
     // Cast to the larger of the two types (of U and Rep) before converting to Rep because in
     // certain cases casting to U before shifting will result in integer overflow (i.e. if U =
     // int32_t, Rep = int64_t and _value > 2 billion)
-    auto const value = std::common_type_t<U, Rep>(_value);
+    auto const value = cuda::std::common_type_t<U, Rep>(_value);
     return static_cast<U>(detail::shift<Rep, Rad>(value, scale_type{-_scale}));
   }
 
@@ -669,7 +687,7 @@ template <typename Rep1, Radix Rad1>
 CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1> operator+(fixed_point<Rep1, Rad1> const& lhs,
                                                           fixed_point<Rep1, Rad1> const& rhs)
 {
-  auto const scale = std::min(lhs._scale, rhs._scale);
+  auto const scale = detail::min(lhs._scale, rhs._scale);
   auto const sum   = lhs.rescaled(scale)._value + rhs.rescaled(scale)._value;
 
 #if defined(__CUDACC_DEBUG__)
@@ -687,7 +705,7 @@ template <typename Rep1, Radix Rad1>
 CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1> operator-(fixed_point<Rep1, Rad1> const& lhs,
                                                           fixed_point<Rep1, Rad1> const& rhs)
 {
-  auto const scale = std::min(lhs._scale, rhs._scale);
+  auto const scale = detail::min(lhs._scale, rhs._scale);
   auto const diff  = lhs.rescaled(scale)._value - rhs.rescaled(scale)._value;
 
 #if defined(__CUDACC_DEBUG__)
@@ -735,7 +753,7 @@ template <typename Rep1, Radix Rad1>
 CUDF_HOST_DEVICE inline bool operator==(fixed_point<Rep1, Rad1> const& lhs,
                                         fixed_point<Rep1, Rad1> const& rhs)
 {
-  auto const scale = std::min(lhs._scale, rhs._scale);
+  auto const scale = detail::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value == rhs.rescaled(scale)._value;
 }
 
@@ -744,7 +762,7 @@ template <typename Rep1, Radix Rad1>
 CUDF_HOST_DEVICE inline bool operator!=(fixed_point<Rep1, Rad1> const& lhs,
                                         fixed_point<Rep1, Rad1> const& rhs)
 {
-  auto const scale = std::min(lhs._scale, rhs._scale);
+  auto const scale = detail::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value != rhs.rescaled(scale)._value;
 }
 
@@ -753,7 +771,7 @@ template <typename Rep1, Radix Rad1>
 CUDF_HOST_DEVICE inline bool operator<=(fixed_point<Rep1, Rad1> const& lhs,
                                         fixed_point<Rep1, Rad1> const& rhs)
 {
-  auto const scale = std::min(lhs._scale, rhs._scale);
+  auto const scale = detail::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value <= rhs.rescaled(scale)._value;
 }
 
@@ -762,7 +780,7 @@ template <typename Rep1, Radix Rad1>
 CUDF_HOST_DEVICE inline bool operator>=(fixed_point<Rep1, Rad1> const& lhs,
                                         fixed_point<Rep1, Rad1> const& rhs)
 {
-  auto const scale = std::min(lhs._scale, rhs._scale);
+  auto const scale = detail::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value >= rhs.rescaled(scale)._value;
 }
 
@@ -771,7 +789,7 @@ template <typename Rep1, Radix Rad1>
 CUDF_HOST_DEVICE inline bool operator<(fixed_point<Rep1, Rad1> const& lhs,
                                        fixed_point<Rep1, Rad1> const& rhs)
 {
-  auto const scale = std::min(lhs._scale, rhs._scale);
+  auto const scale = detail::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value < rhs.rescaled(scale)._value;
 }
 
@@ -780,7 +798,7 @@ template <typename Rep1, Radix Rad1>
 CUDF_HOST_DEVICE inline bool operator>(fixed_point<Rep1, Rad1> const& lhs,
                                        fixed_point<Rep1, Rad1> const& rhs)
 {
-  auto const scale = std::min(lhs._scale, rhs._scale);
+  auto const scale = detail::min(lhs._scale, rhs._scale);
   return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value;
 }
 
@@ -789,7 +807,7 @@ template <typename Rep1, Radix Rad1>
 CUDF_HOST_DEVICE inline fixed_point<Rep1, Rad1> operator%(fixed_point<Rep1, Rad1> const& lhs,
                                                           fixed_point<Rep1, Rad1> const& rhs)
 {
-  auto const scale     = std::min(lhs._scale, rhs._scale);
+  auto const scale     = detail::min(lhs._scale, rhs._scale);
   auto const remainder = lhs.rescaled(scale)._value % rhs.rescaled(scale)._value;
   return fixed_point<Rep1, Rad1>{scaled_integer<Rep1>{remainder, scale}};
 }
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index 307a52cd242..88034b4f804 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -166,6 +166,26 @@ std::unique_ptr<column> sha512(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Computes the XXHash_32 hash value of each row in the given table
+ *
+ * This function computes the hash of each column using the `seed` for the first column
+ * and the resulting hash as a seed for the next column and so on.
+ * The result is a uint32 value for each row.
+ *
+ * @param input The table of columns to hash
+ * @param seed Optional seed value to use for the hash function
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ *
+ * @returns A column where each row is the hash of a row from the input
+ */
+std::unique_ptr<column> xxhash_32(
+  table_view const& input,
+  uint32_t seed                     = DEFAULT_HASH_SEED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Computes the XXHash_64 hash value of each row in the given table
  *
diff --git a/cpp/include/cudf/hashing/detail/hash_functions.cuh b/cpp/include/cudf/hashing/detail/hash_functions.cuh
index 0ec41a20ef1..fd3455e761d 100644
--- a/cpp/include/cudf/hashing/detail/hash_functions.cuh
+++ b/cpp/include/cudf/hashing/detail/hash_functions.cuh
@@ -18,7 +18,8 @@
 
 #include <cudf/utilities/traits.hpp>
 
-#include <limits>
+#include <cuda/std/cmath>
+#include <cuda/std/limits>
 
 namespace cudf::hashing::detail {
 
@@ -29,7 +30,7 @@ template <typename T>
 T __device__ inline normalize_nans(T const& key)
 {
   if constexpr (cudf::is_floating_point<T>()) {
-    if (std::isnan(key)) { return std::numeric_limits<T>::quiet_NaN(); }
+    if (cuda::std::isnan(key)) { return cuda::std::numeric_limits<T>::quiet_NaN(); }
   }
   return key;
 }
diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp
index a978e54a1b9..f796ff4526e 100644
--- a/cpp/include/cudf/hashing/detail/hashing.hpp
+++ b/cpp/include/cudf/hashing/detail/hashing.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -61,6 +61,11 @@ std::unique_ptr<column> sha512(table_view const& input,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr);
 
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view,
+                                  rmm::device_async_resource_ref mr);
+
 std::unique_ptr<column> xxhash_64(table_view const& input,
                                   uint64_t seed,
                                   rmm::cuda_stream_view,
@@ -82,7 +87,7 @@ std::unique_ptr<column> xxhash_64(table_view const& input,
  * @param rhs The second hash value
  * @return Combined hash value
  */
-constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs)
+CUDF_HOST_DEVICE constexpr uint32_t hash_combine(uint32_t lhs, uint32_t rhs)
 {
   return lhs ^ (rhs + 0x9e37'79b9 + (lhs << 6) + (lhs >> 2));
 }
diff --git a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
index e0c7ce840d7..69edf38e359 100644
--- a/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
+++ b/cpp/include/cudf/hashing/detail/murmurhash3_x86_32.cuh
@@ -57,62 +57,71 @@ struct MurmurHash3_x86_32 {
 };
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<bool>::operator()(bool const& key) const
+MurmurHash3_x86_32<bool>::result_type __device__ inline MurmurHash3_x86_32<bool>::operator()(
+  bool const& key) const
 {
   return this->compute(static_cast<uint8_t>(key));
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<float>::operator()(float const& key) const
+MurmurHash3_x86_32<float>::result_type __device__ inline MurmurHash3_x86_32<float>::operator()(
+  float const& key) const
 {
   return this->compute(normalize_nans_and_zeros(key));
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<double>::operator()(double const& key) const
+MurmurHash3_x86_32<double>::result_type __device__ inline MurmurHash3_x86_32<double>::operator()(
+  double const& key) const
 {
   return this->compute(normalize_nans_and_zeros(key));
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<cudf::string_view>::operator()(
-  cudf::string_view const& key) const
+MurmurHash3_x86_32<cudf::string_view>::result_type
+  __device__ inline MurmurHash3_x86_32<cudf::string_view>::operator()(
+    cudf::string_view const& key) const
 {
   return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
                              key.size_bytes());
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal32>::operator()(
-  numeric::decimal32 const& key) const
+MurmurHash3_x86_32<numeric::decimal32>::result_type
+  __device__ inline MurmurHash3_x86_32<numeric::decimal32>::operator()(
+    numeric::decimal32 const& key) const
 {
   return this->compute(key.value());
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal64>::operator()(
-  numeric::decimal64 const& key) const
+MurmurHash3_x86_32<numeric::decimal64>::result_type
+  __device__ inline MurmurHash3_x86_32<numeric::decimal64>::operator()(
+    numeric::decimal64 const& key) const
 {
   return this->compute(key.value());
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<numeric::decimal128>::operator()(
-  numeric::decimal128 const& key) const
+MurmurHash3_x86_32<numeric::decimal128>::result_type
+  __device__ inline MurmurHash3_x86_32<numeric::decimal128>::operator()(
+    numeric::decimal128 const& key) const
 {
   return this->compute(key.value());
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<cudf::list_view>::operator()(
-  cudf::list_view const& key) const
+MurmurHash3_x86_32<cudf::list_view>::result_type
+  __device__ inline MurmurHash3_x86_32<cudf::list_view>::operator()(
+    cudf::list_view const& key) const
 {
   CUDF_UNREACHABLE("List column hashing is not supported");
 }
 
 template <>
-hash_value_type __device__ inline MurmurHash3_x86_32<cudf::struct_view>::operator()(
-  cudf::struct_view const& key) const
+MurmurHash3_x86_32<cudf::struct_view>::result_type
+  __device__ inline MurmurHash3_x86_32<cudf::struct_view>::operator()(
+    cudf::struct_view const& key) const
 {
   CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
 }
diff --git a/cpp/include/cudf/hashing/detail/xxhash_32.cuh b/cpp/include/cudf/hashing/detail/xxhash_32.cuh
new file mode 100644
index 00000000000..bb6e7f18fbc
--- /dev/null
+++ b/cpp/include/cudf/hashing/detail/xxhash_32.cuh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/hashing.hpp>
+#include <cudf/hashing/detail/hash_functions.cuh>
+#include <cudf/lists/list_view.hpp>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/structs/struct_view.hpp>
+#include <cudf/types.hpp>
+
+#include <cuco/hash_functions.cuh>
+#include <cuda/std/cstddef>
+
+namespace cudf::hashing::detail {
+
+template <typename Key>
+struct XXHash_32 {
+  using result_type = std::uint32_t;
+
+  CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}
+
+  __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }
+
+  __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
+                                                 std::uint64_t size) const
+  {
+    return this->_impl.compute_hash(bytes, size);
+  }
+
+ private:
+  template <typename T>
+  __device__ constexpr result_type compute(T const& key) const
+  {
+    return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
+  }
+
+  cuco::xxhash_32<Key> _impl;
+};
+
+template <>
+XXHash_32<bool>::result_type __device__ inline XXHash_32<bool>::operator()(bool const& key) const
+{
+  return this->compute(static_cast<uint8_t>(key));
+}
+
+template <>
+XXHash_32<float>::result_type __device__ inline XXHash_32<float>::operator()(float const& key) const
+{
+  return this->compute(normalize_nans_and_zeros(key));
+}
+
+template <>
+XXHash_32<double>::result_type __device__ inline XXHash_32<double>::operator()(
+  double const& key) const
+{
+  return this->compute(normalize_nans_and_zeros(key));
+}
+
+template <>
+XXHash_32<cudf::string_view>::result_type
+  __device__ inline XXHash_32<cudf::string_view>::operator()(cudf::string_view const& key) const
+{
+  return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
+                             key.size_bytes());
+}
+
+template <>
+XXHash_32<numeric::decimal32>::result_type
+  __device__ inline XXHash_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<numeric::decimal64>::result_type
+  __device__ inline XXHash_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<numeric::decimal128>::result_type
+  __device__ inline XXHash_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<cudf::list_view>::result_type __device__ inline XXHash_32<cudf::list_view>::operator()(
+  cudf::list_view const& key) const
+{
+  CUDF_UNREACHABLE("List column hashing is not supported");
+}
+
+template <>
+XXHash_32<cudf::struct_view>::result_type
+  __device__ inline XXHash_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
+}
+
+}  // namespace cudf::hashing::detail
diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp
index 0d74a4158ad..4ad760d278f 100644
--- a/cpp/include/cudf/io/nvcomp_adapter.hpp
+++ b/cpp/include/cudf/io/nvcomp_adapter.hpp
@@ -22,7 +22,7 @@
 #include <string>
 
 namespace CUDF_EXPORT cudf {
-namespace io::nvcomp {
+namespace io::detail::nvcomp {
 
 enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4, GZIP };
 
@@ -88,5 +88,5 @@ inline bool operator==(feature_status_parameters const& lhs, feature_status_para
 [[nodiscard]] std::optional<std::string> is_decompression_disabled(
   compression_type compression, feature_status_parameters params = feature_status_parameters());
 
-}  // namespace io::nvcomp
+}  // namespace io::detail::nvcomp
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index bfe76d5690c..b561d0989e9 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -410,6 +410,7 @@ class parquet_reader_options_builder {
    *
    * @param val Boolean value whether to read matching projected and filter columns from mismatched
    * Parquet sources.
+   *
    * @return this for chaining.
    */
   parquet_reader_options_builder& allow_mismatched_pq_schemas(bool val)
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index afefd04d4fa..cc63565eee1 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,13 +34,6 @@
 
 namespace CUDF_EXPORT cudf {
 
-/**
- * @brief Enum to indicate whether the distinct join table has nested columns or not
- *
- * @ingroup column_join
- */
-enum class has_nested : bool { YES, NO };
-
 // forward declaration
 namespace hashing::detail {
 
@@ -61,7 +54,6 @@ class hash_join;
 /**
  * @brief Forward declaration for our distinct hash join
  */
-template <cudf::has_nested HasNested>
 class distinct_hash_join;
 }  // namespace detail
 
@@ -469,20 +461,19 @@ class hash_join {
     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
  private:
-  const std::unique_ptr<impl_type const> _impl;
+  std::unique_ptr<impl_type const> _impl;
 };
 
 /**
  * @brief Distinct hash join that builds hash table in creation and probes results in subsequent
  * `*_join` member functions
  *
+ * This class enables the distinct hash join scheme that builds the hash table once and probes it
+ * as many times as needed (possibly in parallel).
+ *
  * @note Behavior is undefined if the build table contains duplicates.
  * @note All NaNs are considered as equal
- *
- * @tparam HasNested Flag indicating whether there are nested columns in build/probe table
  */
-// TODO: `HasNested` to be removed via dispatching
-template <cudf::has_nested HasNested>
 class distinct_hash_join {
  public:
   distinct_hash_join() = delete;
@@ -496,15 +487,10 @@ class distinct_hash_join {
    * @brief Constructs a distinct hash join object for subsequent probe calls
    *
    * @param build The build table that contains distinct elements
-   * @param probe The probe table, from which the keys are probed
-   * @param has_nulls Flag to indicate if there exists any nulls in the `build` table or
-   *        any `probe` table that will be used later for join
    * @param compare_nulls Controls whether null join-key values should match or not
    * @param stream CUDA stream used for device memory operations and kernel launches
    */
   distinct_hash_join(cudf::table_view const& build,
-                     cudf::table_view const& probe,
-                     nullable_join has_nulls      = nullable_join::YES,
                      null_equality compare_nulls  = null_equality::EQUAL,
                      rmm::cuda_stream_view stream = cudf::get_default_stream());
 
@@ -512,16 +498,18 @@ class distinct_hash_join {
    * @brief Returns the row indices that can be used to construct the result of performing
    * an inner join between two tables. @see cudf::inner_join().
    *
+   * @param probe The probe table, from which the keys are probed
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource used to allocate the returned indices' device memory.
    *
-   * @return A pair of columns [`build_indices`, `probe_indices`] that can be used to
+   * @return A pair of columns [`probe_indices`, `build_indices`] that can be used to
    * construct the result of performing an inner join between two tables
    * with `build` and `probe` as the join keys.
    */
   [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
                           std::unique_ptr<rmm::device_uvector<size_type>>>
-  inner_join(rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  inner_join(cudf::table_view const& probe,
+             rmm::cuda_stream_view stream      = cudf::get_default_stream(),
              rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
   /**
@@ -532,19 +520,22 @@ class distinct_hash_join {
    * the row index of the matched row from the build table if there is a match. Otherwise, contains
    * `JoinNoneValue`.
    *
+   * @param probe The probe table, from which the keys are probed
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
+   *
    * @return A `build_indices` column that can be used to construct the result of
    * performing a left join between two tables with `build` and `probe` as the join
    * keys.
    */
   [[nodiscard]] std::unique_ptr<rmm::device_uvector<size_type>> left_join(
+    cudf::table_view const& probe,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
  private:
-  using impl_type = typename cudf::detail::distinct_hash_join<HasNested>;  ///< Implementation type
+  using impl_type = cudf::detail::distinct_hash_join;  ///< Implementation type
 
   std::unique_ptr<impl_type> _impl;  ///< Distinct hash join implementation
 };
diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp
index 85349a421b1..84957ab9f1d 100644
--- a/cpp/include/cudf/strings/detail/utf8.hpp
+++ b/cpp/include/cudf/strings/detail/utf8.hpp
@@ -31,7 +31,7 @@ namespace strings::detail {
  * @param chr Any single byte from a valid UTF-8 character
  * @return true if this is not the first byte of the character
  */
-constexpr bool is_utf8_continuation_char(unsigned char chr)
+CUDF_HOST_DEVICE constexpr bool is_utf8_continuation_char(unsigned char chr)
 {
   // The (0xC0 & 0x80) bit pattern identifies a continuation byte of a character.
   return (chr & 0xC0) == 0x80;
@@ -43,7 +43,10 @@ constexpr bool is_utf8_continuation_char(unsigned char chr)
  * @param chr Any single byte from a valid UTF-8 character
- * @return true if this the first byte of the character
+ * @return true if this is the first byte of the character
  */
-constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_continuation_char(chr); }
+CUDF_HOST_DEVICE constexpr bool is_begin_utf8_char(unsigned char chr)
+{
+  return not is_utf8_continuation_char(chr);
+}
 
 /**
  * @brief This will return true if the passed in byte could be the start of
@@ -55,7 +58,7 @@ constexpr bool is_begin_utf8_char(unsigned char chr) { return not is_utf8_contin
  * @param byte The byte to be tested
  * @return true if this can be the first byte of a character
  */
-constexpr bool is_valid_begin_utf8_char(uint8_t byte)
+CUDF_HOST_DEVICE constexpr bool is_valid_begin_utf8_char(uint8_t byte)
 {
   // to be the first byte of a valid (up to 4 byte) UTF-8 char, byte must be one of:
   //  0b0vvvvvvv a 1 byte character
@@ -72,7 +75,7 @@ constexpr bool is_valid_begin_utf8_char(uint8_t byte)
  * @param character Single character
  * @return Number of bytes
  */
-constexpr size_type bytes_in_char_utf8(char_utf8 character)
+CUDF_HOST_DEVICE constexpr size_type bytes_in_char_utf8(char_utf8 character)
 {
   return 1 + static_cast<size_type>((character & 0x0000'FF00u) > 0) +
          static_cast<size_type>((character & 0x00FF'0000u) > 0) +
@@ -89,7 +92,7 @@ constexpr size_type bytes_in_char_utf8(char_utf8 character)
  * @param byte Byte from an encoded character.
  * @return Number of bytes.
  */
-constexpr size_type bytes_in_utf8_byte(uint8_t byte)
+CUDF_HOST_DEVICE constexpr size_type bytes_in_utf8_byte(uint8_t byte)
 {
   return 1 + static_cast<size_type>((byte & 0xF0) == 0xF0)  // 4-byte character prefix
          + static_cast<size_type>((byte & 0xE0) == 0xE0)    // 3-byte character prefix
@@ -104,7 +107,7 @@ constexpr size_type bytes_in_utf8_byte(uint8_t byte)
  * @param[out] character Single char_utf8 value.
  * @return The number of bytes in the character
  */
-constexpr size_type to_char_utf8(char const* str, char_utf8& character)
+CUDF_HOST_DEVICE constexpr size_type to_char_utf8(char const* str, char_utf8& character)
 {
   size_type const chr_width = bytes_in_utf8_byte(static_cast<uint8_t>(*str));
 
@@ -131,7 +134,7 @@ constexpr size_type to_char_utf8(char const* str, char_utf8& character)
  * @param[out] str Output array.
  * @return The number of bytes in the character
  */
-constexpr inline size_type from_char_utf8(char_utf8 character, char* str)
+CUDF_HOST_DEVICE constexpr inline size_type from_char_utf8(char_utf8 character, char* str)
 {
   size_type const chr_width = bytes_in_char_utf8(character);
   for (size_type idx = 0; idx < chr_width; ++idx) {
@@ -148,7 +151,7 @@ constexpr inline size_type from_char_utf8(char_utf8 character, char* str)
  * @param utf8_char Single UTF-8 character to convert.
  * @return Code-point for the UTF-8 character.
  */
-constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char)
+CUDF_HOST_DEVICE constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char)
 {
   uint32_t unchr = 0;
   if (utf8_char < 0x0000'0080)  // single-byte pass thru
@@ -178,7 +181,7 @@ constexpr uint32_t utf8_to_codepoint(cudf::char_utf8 utf8_char)
  * @param unchr Character code-point to convert.
  * @return Single UTF-8 character.
  */
-constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr)
+CUDF_HOST_DEVICE constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr)
 {
   cudf::char_utf8 utf8 = 0;
   if (unchr < 0x0000'0080)  // single byte utf8
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index 1ae4c3703b2..f0040e069d8 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -31,6 +31,8 @@
 #include <thrust/execution_policy.h>
 #endif
 
+#include <cuda/std/utility>
+
 #include <algorithm>
 
 // This file should only include device code logic.
@@ -75,8 +77,8 @@ __device__ inline size_type characters_in_string(char const* str, size_type byte
  * @param pos Character position to count to
  * @return The number of bytes and the left over non-counted position value
  */
-__device__ inline std::pair<size_type, size_type> bytes_to_character_position(string_view d_str,
-                                                                              size_type pos)
+__device__ inline cuda::std::pair<size_type, size_type> bytes_to_character_position(
+  string_view d_str, size_type pos)
 {
   size_type bytes    = 0;
   auto ptr           = d_str.data();
@@ -303,7 +305,7 @@ __device__ inline char_utf8 string_view::operator[](size_type pos) const
 __device__ inline size_type string_view::byte_offset(size_type pos) const
 {
   if (length() == size_bytes()) return pos;
-  return std::get<0>(strings::detail::bytes_to_character_position(*this, pos));
+  return cuda::std::get<0>(strings::detail::bytes_to_character_position(*this, pos));
 }
 
 __device__ inline int string_view::compare(string_view const& in) const
diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp
index 504c31057ae..33f3176d2c6 100644
--- a/cpp/include/cudf/strings/string_view.hpp
+++ b/cpp/include/cudf/strings/string_view.hpp
@@ -54,7 +54,7 @@ class string_view {
    *
    * @return The number of characters in this string
    */
-  __device__ [[nodiscard]] inline size_type length() const;
+  [[nodiscard]] __device__ inline size_type length() const;
   /**
    * @brief Return a pointer to the internal device array
    *
@@ -119,13 +119,13 @@ class string_view {
    *
    * @return new iterator pointing to the beginning of this string
    */
-  __device__ [[nodiscard]] inline const_iterator begin() const;
+  [[nodiscard]] __device__ inline const_iterator begin() const;
   /**
    * @brief Return new iterator pointing past the end of this string
    *
    * @return new iterator pointing past the end of this string
    */
-  __device__ [[nodiscard]] inline const_iterator end() const;
+  [[nodiscard]] __device__ inline const_iterator end() const;
 
   /**
    * @brief Return single UTF-8 character at the given character position
@@ -140,7 +140,7 @@ class string_view {
    * @param pos Character position
    * @return Byte offset from data() for a given character position
    */
-  __device__ [[nodiscard]] inline size_type byte_offset(size_type pos) const;
+  [[nodiscard]] __device__ inline size_type byte_offset(size_type pos) const;
 
   /**
    * @brief Comparing target string with this string. Each character is compared
@@ -155,7 +155,7 @@ class string_view {
    *            not match is greater in the arg string, or all compared characters
    *            match but the arg string is longer.
    */
-  __device__ [[nodiscard]] inline int compare(string_view const& str) const;
+  [[nodiscard]] __device__ inline int compare(string_view const& str) const;
   /**
    * @brief Comparing target string with this string. Each character is compared
    * as a UTF-8 code-point value.
@@ -225,7 +225,7 @@ class string_view {
    *              Specify -1 to indicate to the end of the string.
    * @return npos if str is not found in this string.
    */
-  __device__ [[nodiscard]] inline size_type find(string_view const& str,
+  [[nodiscard]] __device__ inline size_type find(string_view const& str,
                                                  size_type pos   = 0,
                                                  size_type count = -1) const;
   /**
@@ -253,7 +253,7 @@ class string_view {
    *              Specify -1 to indicate to the end of the string.
    * @return npos if arg string is not found in this string.
    */
-  __device__ [[nodiscard]] inline size_type find(char_utf8 character,
+  [[nodiscard]] __device__ inline size_type find(char_utf8 character,
                                                  size_type pos   = 0,
                                                  size_type count = -1) const;
   /**
@@ -266,7 +266,7 @@ class string_view {
    *              Specify -1 to indicate to the end of the string.
    * @return npos if arg string is not found in this string.
    */
-  __device__ [[nodiscard]] inline size_type rfind(string_view const& str,
+  [[nodiscard]] __device__ inline size_type rfind(string_view const& str,
                                                   size_type pos   = 0,
                                                   size_type count = -1) const;
   /**
@@ -294,7 +294,7 @@ class string_view {
    *              Specify -1 to indicate to the end of the string.
    * @return npos if arg string is not found in this string.
    */
-  __device__ [[nodiscard]] inline size_type rfind(char_utf8 character,
+  [[nodiscard]] __device__ inline size_type rfind(char_utf8 character,
                                                   size_type pos   = 0,
                                                   size_type count = -1) const;
 
@@ -306,7 +306,7 @@ class string_view {
    * @param length Number of characters from start to include in the sub-string.
    * @return New instance pointing to a subset of the characters within this instance.
    */
-  __device__ [[nodiscard]] inline string_view substr(size_type start, size_type length) const;
+  [[nodiscard]] __device__ inline string_view substr(size_type start, size_type length) const;
 
   /**
    * @brief Return minimum value associated with the string type
@@ -386,7 +386,7 @@ class string_view {
    * @param bytepos Byte position from start of _data.
    * @return The character position for the specified byte.
    */
-  __device__ [[nodiscard]] inline size_type character_offset(size_type bytepos) const;
+  [[nodiscard]] __device__ inline size_type character_offset(size_type bytepos) const;
 
   /**
    * @brief Common internal implementation for string_view::find and string_view::rfind.
diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
index 3f33c70c29a..8214ea6e83b 100644
--- a/cpp/include/cudf/table/experimental/row_operators.cuh
+++ b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -33,6 +33,8 @@
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+#include <cuda/std/limits>
+#include <cuda/std/optional>
 #include <cuda/std/tuple>
 #include <cuda/std/utility>
 #include <thrust/detail/use_default.h>
@@ -48,11 +50,8 @@
 #include <thrust/swap.h>
 #include <thrust/transform_reduce.h>
 
-#include <limits>
 #include <memory>
-#include <optional>
 #include <type_traits>
-#include <utility>
 
 namespace CUDF_EXPORT cudf {
 
@@ -287,15 +286,16 @@ class device_row_comparator {
    * `null_order::BEFORE` for all columns.
    * @param comparator Physical element relational comparison functor.
    */
-  device_row_comparator(Nullate check_nulls,
-                        table_device_view lhs,
-                        table_device_view rhs,
-                        device_span<detail::dremel_device_view const> l_dremel_device_views,
-                        device_span<detail::dremel_device_view const> r_dremel_device_views,
-                        std::optional<device_span<int const>> depth                  = std::nullopt,
-                        std::optional<device_span<order const>> column_order         = std::nullopt,
-                        std::optional<device_span<null_order const>> null_precedence = std::nullopt,
-                        PhysicalElementComparator comparator                         = {}) noexcept
+  device_row_comparator(
+    Nullate check_nulls,
+    table_device_view lhs,
+    table_device_view rhs,
+    device_span<detail::dremel_device_view const> l_dremel_device_views,
+    device_span<detail::dremel_device_view const> r_dremel_device_views,
+    cuda::std::optional<device_span<int const>> depth                  = cuda::std::nullopt,
+    cuda::std::optional<device_span<order const>> column_order         = cuda::std::nullopt,
+    cuda::std::optional<device_span<null_order const>> null_precedence = cuda::std::nullopt,
+    PhysicalElementComparator comparator                               = {}) noexcept
     : _lhs{lhs},
       _rhs{rhs},
       _l_dremel(l_dremel_device_views),
@@ -331,9 +331,9 @@ class device_row_comparator {
     Nullate check_nulls,
     table_device_view lhs,
     table_device_view rhs,
-    std::optional<device_span<order const>> column_order         = std::nullopt,
-    std::optional<device_span<null_order const>> null_precedence = std::nullopt,
-    PhysicalElementComparator comparator                         = {}) noexcept
+    cuda::std::optional<device_span<order const>> column_order         = cuda::std::nullopt,
+    cuda::std::optional<device_span<null_order const>> null_precedence = cuda::std::nullopt,
+    PhysicalElementComparator comparator                               = {}) noexcept
     : _lhs{lhs},
       _rhs{rhs},
       _l_dremel{},
@@ -410,7 +410,7 @@ class device_row_comparator {
 
       return cuda::std::pair(_comparator(_lhs.element<Element>(lhs_element_index),
                                          _rhs.element<Element>(rhs_element_index)),
-                             std::numeric_limits<int>::max());
+                             cuda::std::numeric_limits<int>::max());
     }
 
     /**
@@ -455,7 +455,7 @@ class device_row_comparator {
         }
 
         if (lcol.num_child_columns() == 0) {
-          return cuda::std::pair(weak_ordering::EQUIVALENT, std::numeric_limits<int>::max());
+          return cuda::std::pair(weak_ordering::EQUIVALENT, cuda::std::numeric_limits<int>::max());
         }
 
         // Non-empty structs have been modified to only have 1 child when using this.
@@ -607,7 +607,7 @@ class device_row_comparator {
   __device__ constexpr weak_ordering operator()(size_type const lhs_index,
                                                 size_type const rhs_index) const noexcept
   {
-    int last_null_depth = std::numeric_limits<int>::max();
+    int last_null_depth = cuda::std::numeric_limits<int>::max();
     size_type list_column_index{-1};
     for (size_type i = 0; i < _lhs.num_columns(); ++i) {
       if (_lhs.column(i).type().id() == type_id::LIST) { ++list_column_index; }
@@ -626,9 +626,9 @@ class device_row_comparator {
       // here, otherwise the current code would be failing.
       auto const [l_dremel_i, r_dremel_i] =
         _lhs.column(i).type().id() == type_id::LIST
-          ? std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]),
-                            optional_dremel_view(_r_dremel[list_column_index]))
-          : std::make_tuple(optional_dremel_view{}, optional_dremel_view{});
+          ? cuda::std::make_tuple(optional_dremel_view(_l_dremel[list_column_index]),
+                                  optional_dremel_view(_r_dremel[list_column_index]))
+          : cuda::std::make_tuple(optional_dremel_view{}, optional_dremel_view{});
 
       auto element_comp = element_comparator{_check_nulls,
                                              _lhs.column(i),
@@ -658,9 +658,9 @@ class device_row_comparator {
   device_span<detail::dremel_device_view const> const _l_dremel;
   device_span<detail::dremel_device_view const> const _r_dremel;
   Nullate const _check_nulls;
-  std::optional<device_span<int const>> const _depth;
-  std::optional<device_span<order const>> const _column_order;
-  std::optional<device_span<null_order const>> const _null_precedence;
+  cuda::std::optional<device_span<int const>> const _depth;
+  cuda::std::optional<device_span<order const>> const _column_order;
+  cuda::std::optional<device_span<null_order const>> const _null_precedence;
   PhysicalElementComparator const _comparator;
 };  // class device_row_comparator
 
@@ -882,10 +882,10 @@ struct preprocessed_table {
    * @return Device array containing respective column orders. If no explicit column orders were
    * specified during the creation of this object then this will be `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<order const>> column_order() const
+  [[nodiscard]] cuda::std::optional<device_span<order const>> column_order() const
   {
-    return _column_order.size() ? std::optional<device_span<order const>>(_column_order)
-                                : std::nullopt;
+    return _column_order.size() ? cuda::std::optional<device_span<order const>>(_column_order)
+                                : cuda::std::nullopt;
   }
 
   /**
@@ -895,10 +895,11 @@ struct preprocessed_table {
    * @return Device array containing respective column null precedence. If no explicit column null
    * precedences were specified during the creation of this object then this will be `nullopt`.
    */
-  [[nodiscard]] std::optional<device_span<null_order const>> null_precedence() const
+  [[nodiscard]] cuda::std::optional<device_span<null_order const>> null_precedence() const
   {
-    return _null_precedence.size() ? std::optional<device_span<null_order const>>(_null_precedence)
-                                   : std::nullopt;
+    return _null_precedence.size()
+             ? cuda::std::optional<device_span<null_order const>>(_null_precedence)
+             : cuda::std::nullopt;
   }
 
   /**
@@ -909,9 +910,10 @@ struct preprocessed_table {
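-   * @return std::optional<device_span<int const>> Device array containing respective column depths.
-   * If there are no nested columns in the table then this will be `nullopt`.
+   * @return cuda::std::optional<device_span<int const>> Device array containing respective column
+   * depths. If there are no nested columns in the table then this will be `nullopt`.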
    */
-  [[nodiscard]] std::optional<device_span<int const>> depths() const
+  [[nodiscard]] cuda::std::optional<device_span<int const>> depths() const
   {
-    return _depths.size() ? std::optional<device_span<int const>>(_depths) : std::nullopt;
+    return _depths.size() ? cuda::std::optional<device_span<int const>>(_depths)
+                          : cuda::std::nullopt;
   }
 
   [[nodiscard]] device_span<detail::dremel_device_view const> dremel_device_views() const
@@ -940,8 +942,8 @@ struct preprocessed_table {
   rmm::device_uvector<size_type> const _depths;
 
   // Dremel encoding of list columns used for the comparison algorithm
-  std::optional<std::vector<detail::dremel_data>> _dremel_data;
-  std::optional<rmm::device_uvector<detail::dremel_device_view>> _dremel_device_views;
+  cuda::std::optional<std::vector<detail::dremel_data>> _dremel_data;
+  cuda::std::optional<rmm::device_uvector<detail::dremel_device_view>> _dremel_device_views;
 
   // Intermediate columns generated from transforming nested children columns into
   // integers columns using `cudf::rank()`, need to be kept alive.
@@ -1808,7 +1810,7 @@ class element_hasher {
   __device__ element_hasher(
     Nullate nulls,
     uint32_t seed             = DEFAULT_HASH_SEED,
-    hash_value_type null_hash = std::numeric_limits<hash_value_type>::max()) noexcept
+    hash_value_type null_hash = cuda::std::numeric_limits<hash_value_type>::max()) noexcept
     : _check_nulls(nulls), _seed(seed), _null_hash(null_hash)
   {
   }
@@ -1892,7 +1894,7 @@ class device_row_hasher {
    */
   template <template <typename> class hash_fn>
   class element_hasher_adapter {
-    static constexpr hash_value_type NULL_HASH     = std::numeric_limits<hash_value_type>::max();
+    static constexpr hash_value_type NULL_HASH = cuda::std::numeric_limits<hash_value_type>::max();
     static constexpr hash_value_type NON_NULL_HASH = 0;
 
    public:
diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh
index 16d532ea2b8..4f6238b5fe7 100644
--- a/cpp/include/cudf/table/table_device_view.cuh
+++ b/cpp/include/cudf/table/table_device_view.cuh
@@ -16,6 +16,8 @@
 #pragma once
 
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -251,7 +253,7 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
   // A buffer of CPU memory is allocated to hold the ColumnDeviceView
   // objects. Once filled, the CPU memory is then copied to device memory
   // and the pointer is set in the d_columns member.
-  std::vector<int8_t> h_buffer(padded_views_size_bytes);
+  auto h_buffer = cudf::detail::make_host_vector<int8_t>(padded_views_size_bytes, stream);
   // Each ColumnDeviceView instance may have child objects which may
   // require setting some internal device pointers before being copied
   // from CPU to device.
@@ -266,8 +268,10 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
   auto d_columns = detail::child_columns_to_device_array<ColumnDeviceView>(
     source_view.begin(), source_view.end(), h_ptr, d_ptr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(d_ptr, h_ptr, views_size_bytes, cudaMemcpyDefault, stream.value()));
-  stream.synchronize();
+  auto const h_span = host_span<int8_t const>{h_buffer}.subspan(
+    static_cast<int8_t const*>(h_ptr) - h_buffer.data(), views_size_bytes);
+  auto const d_span = device_span<int8_t>{static_cast<int8_t*>(d_ptr), views_size_bytes};
+  cudf::detail::cuda_memcpy(d_span, h_span, stream);
   return std::make_tuple(std::move(descendant_storage), d_columns);
 }
 
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index 409b8c825bb..9443bd5cb52 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -266,7 +266,7 @@ class data_type {
    *
    * @param id The type's identifier
    */
-  explicit constexpr data_type(type_id id) : _id{id} {}
+  CUDF_HOST_DEVICE explicit constexpr data_type(type_id id) : _id{id} {}
 
   /**
    * @brief Construct a new `data_type` object for `numeric::fixed_point`
@@ -284,14 +284,17 @@ class data_type {
    *
    * @return The type identifier
    */
-  [[nodiscard]] constexpr type_id id() const noexcept { return _id; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr type_id id() const noexcept { return _id; }
 
   /**
    * @brief Returns the scale (for fixed_point types)
    *
    * @return The scale
    */
-  [[nodiscard]] constexpr int32_t scale() const noexcept { return _fixed_point_scale; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t scale() const noexcept
+  {
+    return _fixed_point_scale;
+  }
 
  private:
   type_id _id{type_id::EMPTY};
diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp
deleted file mode 100644
index 982554a23f5..00000000000
--- a/cpp/include/cudf/utilities/logger.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cudf/utilities/export.hpp>
-
-#include <spdlog/spdlog.h>
-
-namespace CUDF_EXPORT cudf {
-
-namespace detail {
-spdlog::logger& logger();
-}
-
-/**
- * @brief Returns the global logger.
- *
- * This is a global instance of a spdlog logger. It can be used to configure logging behavior in
- * libcudf.
- *
- * Examples:
- * @code{.cpp}
- * // Turn off logging at runtime
- * cudf::logger().set_level(spdlog::level::off);
- * // Add a stdout sink to the logger
- * cudf::logger().sinks().push_back(std::make_shared<spdlog::sinks::stdout_sink_mt>());
- * // Replace the default sink
- * cudf::logger().sinks() ={std::make_shared<spdlog::sinks::stderr_sink_mt>()};
- * @endcode
- *
- * Note: Changes to the sinks are not thread safe and should only be done during global
- * initialization.
- *
- * @return spdlog::logger& The logger.
- */
-[[deprecated(
-  "Support for direct access to spdlog loggers in cudf is planned for removal")]] spdlog::logger&
-logger();
-
-}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/memory_resource.hpp b/cpp/include/cudf/utilities/memory_resource.hpp
index b562574fd79..eaba466557b 100644
--- a/cpp/include/cudf/utilities/memory_resource.hpp
+++ b/cpp/include/cudf/utilities/memory_resource.hpp
@@ -16,8 +16,6 @@
 
 #pragma once
 
-#include <cudf/utilities/memory_resource.hpp>
-
 #include <rmm/cuda_device.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index 21ee4fa9e9b..e7b76946248 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/detail/utilities/host_vector.hpp>
+#include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
 
 #include <rmm/device_buffer.hpp>
@@ -69,52 +70,22 @@ class span_base {
 
   static constexpr std::size_t extent = Extent;  ///< The extent of the span
 
-  constexpr span_base() noexcept {}
+  CUDF_HOST_DEVICE constexpr span_base() noexcept {}
   /**
    * @brief Constructs a span from a pointer and a size.
    *
    * @param data Pointer to the first element in the span.
    * @param size The number of elements in the span.
    */
-  constexpr span_base(pointer data, size_type size) : _data(data), _size(size) {}
+  CUDF_HOST_DEVICE constexpr span_base(pointer data, size_type size) : _data(data), _size(size) {}
   // constexpr span_base(pointer begin, pointer end) : _data(begin), _size(end - begin) {}
-  constexpr span_base(span_base const&) noexcept = default;  ///< Copy constructor
+  CUDF_HOST_DEVICE constexpr span_base(span_base const&) noexcept = default;  ///< Copy constructor
   /**
    * @brief Copy assignment operator.
    *
    * @return Reference to this span.
    */
-  constexpr span_base& operator=(span_base const&) noexcept = default;
-
-  // not noexcept due to undefined behavior when size = 0
-  /**
-   * @brief Returns a reference to the first element in the span.
-   *
-   * Calling front on an empty span results in undefined behavior.
-   *
-   * @return Reference to the first element in the span
-   */
-  [[nodiscard]] constexpr reference front() const { return _data[0]; }
-  // not noexcept due to undefined behavior when size = 0
-  /**
-   * @brief Returns a reference to the last element in the span.
-   *
-   * Calling last on an empty span results in undefined behavior.
-   *
-   * @return Reference to the last element in the span
-   */
-  [[nodiscard]] constexpr reference back() const { return _data[_size - 1]; }
-  // not noexcept due to undefined behavior when idx < 0 || idx >= size
-  /**
-   * @brief Returns a reference to the idx-th element of the sequence.
-   *
-   * The behavior is undefined if idx is out of range (i.e., if it is greater than or equal to
-   * size()).
-   *
-   * @param idx the index of the element to access
-   * @return A reference to the idx-th element of the sequence, i.e., `data()[idx]`
-   */
-  constexpr reference operator[](size_type idx) const { return _data[idx]; }
+  CUDF_HOST_DEVICE constexpr span_base& operator=(span_base const&) noexcept = default;
 
   /**
    * @brief Returns an iterator to the first element of the span.
@@ -123,7 +94,7 @@ class span_base {
    *
    * @return An iterator to the first element of the span
    */
-  [[nodiscard]] constexpr iterator begin() const noexcept { return _data; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr iterator begin() const noexcept { return _data; }
   /**
    * @brief Returns an iterator to the element following the last element of the span.
    *
@@ -131,32 +102,36 @@ class span_base {
    *
    * @return An iterator to the element following the last element of the span
    */
-  [[nodiscard]] constexpr iterator end() const noexcept { return _data + _size; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr iterator end() const noexcept { return _data + _size; }
   /**
    * @brief Returns a pointer to the beginning of the sequence.
    *
    * @return A pointer to the first element of the span
    */
-  [[nodiscard]] constexpr pointer data() const noexcept { return _data; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr pointer data() const noexcept { return _data; }
 
   /**
    * @brief Returns the number of elements in the span.
    *
    * @return The number of elements in the span
    */
-  [[nodiscard]] constexpr size_type size() const noexcept { return _size; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr size_type size() const noexcept { return _size; }
   /**
    * @brief Returns the size of the sequence in bytes.
    *
    * @return The size of the sequence in bytes
    */
-  [[nodiscard]] constexpr size_type size_bytes() const noexcept { return sizeof(T) * _size; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr size_type size_bytes() const noexcept
+  {
+    return sizeof(T) * _size;
+  }
+
   /**
    * @brief Checks if the span is empty.
    *
    * @return True if the span is empty, false otherwise
    */
-  [[nodiscard]] constexpr bool empty() const noexcept { return _size == 0; }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr bool empty() const noexcept { return _size == 0; }
 
   /**
    * @brief Obtains a subspan consisting of the first N elements of the sequence
@@ -180,9 +155,9 @@ class span_base {
     return Derived(_data + _size - count, count);
   }
 
- private:
-  pointer _data{nullptr};
-  size_type _size{0};
+ protected:
+  pointer _data{nullptr};  ///< Pointer to the first element in the span
+  size_type _size{0};      ///< The number of elements in the span
 };
 
 }  // namespace detail
@@ -288,6 +263,39 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
     : base(other.data(), other.size()), _is_device_accessible{other.is_device_accessible()}
   {
   }
+  // not noexcept due to undefined behavior when idx < 0 || idx >= size
+  /**
+   * @brief Returns a reference to the idx-th element of the sequence.
+   *
+   * The behavior is undefined if idx is out of range (i.e., if it is greater than or equal to
+   * size()).
+   *
+   * @param idx the index of the element to access
+   * @return A reference to the idx-th element of the sequence, i.e., `data()[idx]`
+   */
+  constexpr typename base::reference operator[](size_type idx) const { return this->_data[idx]; }
+
+  // not noexcept due to undefined behavior when size = 0
+  /**
+   * @brief Returns a reference to the first element in the span.
+   *
+   * Calling front on an empty span results in undefined behavior.
+   *
+   * @return Reference to the first element in the span
+   */
+  [[nodiscard]] constexpr typename base::reference front() const { return this->_data[0]; }
+  // not noexcept due to undefined behavior when size = 0
+  /**
+   * @brief Returns a reference to the last element in the span.
+   *
+   * Calling back on an empty span results in undefined behavior.
+   *
+   * @return Reference to the last element in the span
+   */
+  [[nodiscard]] constexpr typename base::reference back() const
+  {
+    return this->_data[this->_size - 1];
+  }
 
   /**
    * @brief Returns whether the data is device accessible (e.g. pinned memory)
@@ -339,7 +347,7 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Ex
   using base = cudf::detail::span_base<T, Extent, device_span<T, Extent>>;  ///< Base type
   using base::base;
 
-  constexpr device_span() noexcept : base() {}  // required to compile on centos
+  CUDF_HOST_DEVICE constexpr device_span() noexcept : base() {}  // required to compile on centos
 
   /// Constructor from container
   /// @param in The container to construct the span from
@@ -374,11 +382,51 @@ struct device_span : public cudf::detail::span_base<T, Extent, device_span<T, Ex
             std::enable_if_t<(Extent == OtherExtent || Extent == dynamic_extent) &&
                                std::is_convertible_v<OtherT (*)[], T (*)[]>,  // NOLINT
                              void>* = nullptr>
-  constexpr device_span(device_span<OtherT, OtherExtent> const& other) noexcept
+  CUDF_HOST_DEVICE constexpr device_span(device_span<OtherT, OtherExtent> const& other) noexcept
     : base(other.data(), other.size())
   {
   }
 
+  // not noexcept due to undefined behavior when idx < 0 || idx >= size
+  /**
+   * @brief Returns a reference to the idx-th element of the sequence.
+   *
+   * The behavior is undefined if idx is out of range (i.e., if it is greater than or equal to
+   * size()).
+   *
+   * @param idx the index of the element to access
+   * @return A reference to the idx-th element of the sequence, i.e., `data()[idx]`
+   */
+  __device__ constexpr typename base::reference operator[](size_type idx) const
+  {
+    return this->_data[idx];
+  }
+
+  // not noexcept due to undefined behavior when size = 0
+  /**
+   * @brief Returns a reference to the first element in the span.
+   *
+   * Calling front on an empty span results in undefined behavior.
+   *
+   * @return Reference to the first element in the span
+   */
+  [[nodiscard]] __device__ constexpr typename base::reference front() const
+  {
+    return this->_data[0];
+  }
+  // not noexcept due to undefined behavior when size = 0
+  /**
+   * @brief Returns a reference to the last element in the span.
+   *
+   * Calling back on an empty span results in undefined behavior.
+   *
+   * @return Reference to the last element in the span
+   */
+  [[nodiscard]] __device__ constexpr typename base::reference back() const
+  {
+    return this->_data[this->_size - 1];
+  }
+
   /**
    * @brief Obtains a span that is a view over the `count` elements of this span starting at offset
    *
@@ -417,7 +465,9 @@ class base_2dspan {
   constexpr base_2dspan(RowType<T, dynamic_extent> flat_view, size_t columns)
     : _flat{flat_view}, _size{columns == 0 ? 0 : flat_view.size() / columns, columns}
   {
+#ifndef __CUDA_ARCH__
     CUDF_EXPECTS(_size.first * _size.second == flat_view.size(), "Invalid 2D span size");
+#endif
   }
 
   /**
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index dae1cd38832..0f4bde204fa 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -169,7 +169,7 @@ bool is_equality_comparable(data_type type);
  * @return false  `T` is not numeric
  */
 template <typename T>
-constexpr inline bool is_numeric()
+CUDF_HOST_DEVICE constexpr inline bool is_numeric()
 {
   return cuda::std::is_arithmetic<T>();
 }
@@ -271,9 +271,9 @@ bool is_unsigned(data_type type);
  * @return true if the iterator's value type is unsigned
  */
 template <typename Iterator>
-constexpr inline bool is_signed_iterator()
+CUDF_HOST_DEVICE constexpr inline bool is_signed_iterator()
 {
-  return std::is_signed_v<typename std::iterator_traits<Iterator>::value_type>;
+  return cuda::std::is_signed_v<typename cuda::std::iterator_traits<Iterator>::value_type>;
 }
 
 /**
@@ -356,9 +356,9 @@ bool is_numeric_not_bool(data_type type);
  * @return false  `T` is not floating point
  */
 template <typename T>
-constexpr inline bool is_floating_point()
+CUDF_HOST_DEVICE constexpr inline bool is_floating_point()
 {
-  return std::is_floating_point_v<T>;
+  return cuda::std::is_floating_point_v<T>;
 }
 
 /**
@@ -415,7 +415,7 @@ bool is_boolean(data_type type);
  * @return false  `T` is not a timestamp
  */
 template <typename T>
-constexpr inline bool is_timestamp()
+CUDF_HOST_DEVICE constexpr inline bool is_timestamp()
 {
   return is_timestamp_t<T>::value;
 }
@@ -439,13 +439,14 @@ bool is_timestamp(data_type type);
  * @return false  `T` is not a fixed-point type
  */
 template <typename T>
-constexpr inline bool is_fixed_point()
+CUDF_HOST_DEVICE constexpr inline bool is_fixed_point()
 {
-  return std::is_same_v<numeric::decimal32, T> || std::is_same_v<numeric::decimal64, T> ||
-         std::is_same_v<numeric::decimal128, T> ||
-         std::is_same_v<numeric::fixed_point<int32_t, numeric::Radix::BASE_2>, T> ||
-         std::is_same_v<numeric::fixed_point<int64_t, numeric::Radix::BASE_2>, T> ||
-         std::is_same_v<numeric::fixed_point<__int128_t, numeric::Radix::BASE_2>, T>;
+  return cuda::std::is_same_v<numeric::decimal32, T> ||
+         cuda::std::is_same_v<numeric::decimal64, T> ||
+         cuda::std::is_same_v<numeric::decimal128, T> ||
+         cuda::std::is_same_v<numeric::fixed_point<int32_t, numeric::Radix::BASE_2>, T> ||
+         cuda::std::is_same_v<numeric::fixed_point<int64_t, numeric::Radix::BASE_2>, T> ||
+         cuda::std::is_same_v<numeric::fixed_point<__int128_t, numeric::Radix::BASE_2>, T>;
 }
 
 /**
@@ -465,7 +466,7 @@ bool is_fixed_point(data_type type);
  * @return false  `T` is not a duration
  */
 template <typename T>
-constexpr inline bool is_duration()
+CUDF_HOST_DEVICE constexpr inline bool is_duration()
 {
   return is_duration_t<T>::value;
 }
@@ -489,7 +490,7 @@ bool is_duration(data_type type);
  * @return false  `T` is neither a duration nor a timestamp type
  */
 template <typename T>
-constexpr inline bool is_chrono()
+CUDF_HOST_DEVICE constexpr inline bool is_chrono()
 {
   return is_duration<T>() || is_timestamp<T>();
 }
@@ -557,7 +558,7 @@ bool is_dictionary(data_type type);
  * @return false `T` corresponds to a variable-width element type
  */
 template <typename T>
-constexpr inline bool is_fixed_width()
+CUDF_HOST_DEVICE constexpr inline bool is_fixed_width()
 {
   // TODO Add fixed width wrapper types
   // Is a category fixed width?
@@ -590,10 +591,11 @@ class string_view;
  * @return false `T` corresponds to a "simple" type
  */
 template <typename T>
-constexpr inline bool is_compound()
+CUDF_HOST_DEVICE constexpr inline bool is_compound()
 {
-  return std::is_same_v<T, cudf::string_view> or std::is_same_v<T, cudf::dictionary32> or
-         std::is_same_v<T, cudf::list_view> or std::is_same_v<T, cudf::struct_view>;
+  return cuda::std::is_same_v<T, cudf::string_view> or
+         cuda::std::is_same_v<T, cudf::dictionary32> or cuda::std::is_same_v<T, cudf::list_view> or
+         cuda::std::is_same_v<T, cudf::struct_view>;
 }
 
 /**
@@ -622,9 +624,9 @@ bool is_compound(data_type type);
  * @return false T is not a nested type
  */
 template <typename T>
-constexpr inline bool is_nested()
+CUDF_HOST_DEVICE constexpr inline bool is_nested()
 {
-  return std::is_same_v<T, cudf::list_view> || std::is_same_v<T, cudf::struct_view>;
+  return cuda::std::is_same_v<T, cudf::list_view> || cuda::std::is_same_v<T, cudf::struct_view>;
 }
 
 /**
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index 6351a84e38f..c1dd79ef14f 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -53,7 +53,7 @@ namespace CUDF_EXPORT cudf {
  * @return The `cudf::type_id` corresponding to the specified type
  */
 template <typename T>
-inline constexpr type_id type_to_id()
+CUDF_HOST_DEVICE inline constexpr type_id type_to_id()
 {
   return type_id::EMPTY;
 };
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index b2c1a23f57e..f0d5d9ecb5d 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -31,69 +31,6 @@ namespace CUDF_EXPORT nvtext {
  * @file
  */
 
-/**
- * @brief Returns the minhash value for each string
- *
- * Hash values are computed from substrings of each string and the
- * minimum hash value is returned for each string.
- *
- * Any null row entries result in corresponding null output rows.
- *
- * This function uses MurmurHash3_x86_32 for the hash algorithm.
- *
- * @deprecated Deprecated in 24.12
- *
- * @throw std::invalid_argument if the width < 2
- *
- * @param input Strings column to compute minhash
- * @param seed  Seed value used for the hash algorithm
- * @param width The character width used for apply substrings;
- *              Default is 4 characters.
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return Minhash values for each string in input
- */
-[[deprecated]] std::unique_ptr<cudf::column> minhash(
-  cudf::strings_column_view const& input,
-  cudf::numeric_scalar<uint32_t> seed = 0,
-  cudf::size_type width               = 4,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr   = cudf::get_current_device_resource_ref());
-
-/**
- * @brief Returns the minhash values for each string per seed
- *
- * Hash values are computed from substrings of each string and the
- * minimum hash value is returned for each string for each seed.
- * Each row of the list column are seed results for the corresponding
- * string. The order of the elements in each row match the order of
- * the seeds provided in the `seeds` parameter.
- *
- * This function uses MurmurHash3_x86_32 for the hash algorithm.
- *
- * Any null row entries result in corresponding null output rows.
- *
- * @deprecated Deprecated in 24.12 - to be replaced in a future release
- *
- * @throw std::invalid_argument if the width < 2
- * @throw std::invalid_argument if seeds is empty
- * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
- *
- * @param input Strings column to compute minhash
- * @param seeds Seed values used for the hash algorithm
- * @param width The character width used for apply substrings;
- *              Default is 4 characters.
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return List column of minhash values for each string per seed
- */
-[[deprecated]] std::unique_ptr<cudf::column> minhash(
-  cudf::strings_column_view const& input,
-  cudf::device_span<uint32_t const> seeds,
-  cudf::size_type width             = 4,
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
-
 /**
  * @brief Returns the minhash values for each string
  *
@@ -132,7 +69,7 @@ namespace CUDF_EXPORT nvtext {
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
  */
-std::unique_ptr<cudf::column> minhash_permuted(
+std::unique_ptr<cudf::column> minhash(
   cudf::strings_column_view const& input,
   uint32_t seed,
   cudf::device_span<uint32_t const> parameter_a,
@@ -142,67 +79,16 @@ std::unique_ptr<cudf::column> minhash_permuted(
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
- * @brief Returns the minhash value for each string
- *
- * Hash values are computed from substrings of each string and the
- * minimum hash value is returned for each string.
- *
- * Any null row entries result in corresponding null output rows.
- *
- * This function uses MurmurHash3_x64_128 for the hash algorithm.
- * The hash function returns 2 uint64 values but only the first value
- * is used with the minhash calculation.
- *
- * @deprecated Deprecated in 24.12
- *
- * @throw std::invalid_argument if the width < 2
- *
- * @param input Strings column to compute minhash
- * @param seed  Seed value used for the hash algorithm
- * @param width The character width used for apply substrings;
- *              Default is 4 characters.
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return Minhash values as UINT64 for each string in input
- */
-[[deprecated]] std::unique_ptr<cudf::column> minhash64(
-  cudf::strings_column_view const& input,
-  cudf::numeric_scalar<uint64_t> seed = 0,
-  cudf::size_type width               = 4,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr   = cudf::get_current_device_resource_ref());
-
-/**
- * @brief Returns the minhash values for each string per seed
- *
- * Hash values are computed from substrings of each string and the
- * minimum hash value is returned for each string for each seed.
- * Each row of the list column are seed results for the corresponding
- * string. The order of the elements in each row match the order of
- * the seeds provided in the `seeds` parameter.
- *
- * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ * @copydoc nvtext::minhash
  *
- * Any null row entries result in corresponding null output rows.
- *
- * @deprecated Deprecated in 24.12 - to be replaced in a future release
- *
- * @throw std::invalid_argument if the width < 2
- * @throw std::invalid_argument if seeds is empty
- * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
- *
- * @param input Strings column to compute minhash
- * @param seeds Seed values used for the hash algorithm
- * @param width The character width used for apply substrings;
- *              Default is 4 characters.
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return List column of minhash values for each string per seed
+ * @deprecated Use nvtext::minhash()
  */
-[[deprecated]] std::unique_ptr<cudf::column> minhash64(
+[[deprecated]] std::unique_ptr<cudf::column> minhash_permuted(
   cudf::strings_column_view const& input,
-  cudf::device_span<uint64_t const> seeds,
-  cudf::size_type width             = 4,
+  uint32_t seed,
+  cudf::device_span<uint32_t const> parameter_a,
+  cudf::device_span<uint32_t const> parameter_b,
+  cudf::size_type width,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
@@ -244,7 +130,7 @@ std::unique_ptr<cudf::column> minhash_permuted(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
  */
-std::unique_ptr<cudf::column> minhash64_permuted(
+std::unique_ptr<cudf::column> minhash64(
   cudf::strings_column_view const& input,
   uint64_t seed,
   cudf::device_span<uint64_t const> parameter_a,
@@ -254,64 +140,18 @@ std::unique_ptr<cudf::column> minhash64_permuted(
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
- * @brief Returns the minhash values for each row of strings per seed
- *
- * Hash values are computed from each string in each row and the
- * minimum hash value is returned for each row for each seed.
- * Each row of the output list column are seed results for the corresponding
- * input row. The order of the elements in each row match the order of
- * the seeds provided in the `seeds` parameter.
- *
- * This function uses MurmurHash3_x86_32 for the hash algorithm.
- *
- * Any null row entries result in corresponding null output rows.
+ * @copydoc nvtext::minhash64
  *
- * @deprecated Deprecated in 24.12
- *
- * @throw std::invalid_argument if seeds is empty
- * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
- *
- * @param input Lists column of strings to compute minhash
- * @param seeds Seed values used for the hash algorithm
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return List column of minhash values for each string per seed
+ * @deprecated Use nvtext::minhash64()
  */
-[[deprecated]] std::unique_ptr<cudf::column> word_minhash(
-  cudf::lists_column_view const& input,
-  cudf::device_span<uint32_t const> seeds,
+[[deprecated]] std::unique_ptr<cudf::column> minhash64_permuted(
+  cudf::strings_column_view const& input,
+  uint64_t seed,
+  cudf::device_span<uint64_t const> parameter_a,
+  cudf::device_span<uint64_t const> parameter_b,
+  cudf::size_type width,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
-/**
- * @brief Returns the minhash values for each row of strings per seed
- *
- * Hash values are computed from each string in each row and the
- * minimum hash value is returned for each row for each seed.
- * Each row of the output list column are seed results for the corresponding
- * input row. The order of the elements in each row match the order of
- * the seeds provided in the `seeds` parameter.
- *
- * This function uses MurmurHash3_x64_128 for the hash algorithm though
- * only the first 64-bits of the hash are used in computing the output.
- *
- * Any null row entries result in corresponding null output rows.
- *
- * @deprecated Deprecated in 24.12
- *
- * @throw std::invalid_argument if seeds is empty
- * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
- *
- * @param input Lists column of strings to compute minhash
- * @param seeds Seed values used for the hash algorithm
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- * @return List column of minhash values for each string per seed
- */
-[[deprecated]] std::unique_ptr<cudf::column> word_minhash64(
-  cudf::lists_column_view const& input,
-  cudf::device_span<uint64_t const> seeds,
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py
index 42f84e4d0c7..e111367d191 100755
--- a/cpp/scripts/sort_ninja_log.py
+++ b/cpp/scripts/sort_ninja_log.py
@@ -1,8 +1,9 @@
 #
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 #
 import argparse
 import os
+import re
 import sys
 import xml.etree.ElementTree as ET
 from pathlib import Path
@@ -144,6 +145,16 @@ def format_file_size(input_size):
     return file_size_str
 
 
+def replace_placeholder_patterns(input_string: str) -> str:
+    pattern = r'(_h_env_placehold)[_placehold]+'
+    return re.sub(pattern, r'\1...', input_string)
+
+
+# Adjust name for display (collapses long placeholder runs to "...")
+def format_file_name(name: str) -> str:
+    return replace_placeholder_patterns(name)
+
+
 # Output chart results in HTML format
 # Builds a standalone html file with no javascript or styles
 def output_html(entries, sorted_list, cmp_entries, args):
@@ -223,7 +234,8 @@ def output_html(entries, sorted_list, cmp_entries, args):
             print("<td height='20px' width='", size, "px' ", sep="", end="")
             # title text is shown as hover-text by most browsers
             print(color, "title='", end="")
-            print(name, "\n", build_time_str, "' ", sep="", end="")
+            display_name = format_file_name(name)
+            print(display_name, "\n", build_time_str, "' ", sep="", end="")
             # centers the name if it fits in the box
             print("align='center' nowrap>", end="")
             # use a slightly smaller, fixed-width font
@@ -265,7 +277,8 @@ def output_html(entries, sorted_list, cmp_entries, args):
         file_size_str = format_file_size(file_size)
 
         # output entry row
-        print("<tr ", color, "><td>", name, "</td>", sep="", end="")
+        display_name = format_file_name(name)
+        print("<tr ", color, "><td>", display_name, "</td>", sep="", end="")
         print("<td align='right'>", build_time_str, "</td>", sep="", end="")
         print("<td align='right'>", file_size_str, "</td>", sep="", end="")
         # output diff column
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index a60a7f63882..0d4400b891b 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -237,6 +237,12 @@ std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   return visit(col_type, static_cast<aggregation const&>(agg));
 }
 
+std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
+  data_type col_type, host_udf_aggregation const& agg)
+{
+  return visit(col_type, static_cast<aggregation const&>(agg));
+}
+
 // aggregation_finalizer ----------------------------------------
 
 void aggregation_finalizer::visit(aggregation const& agg) {}
@@ -410,6 +416,11 @@ void aggregation_finalizer::visit(merge_tdigest_aggregation const& agg)
   visit(static_cast<aggregation const&>(agg));
 }
 
+void aggregation_finalizer::visit(host_udf_aggregation const& agg)
+{
+  visit(static_cast<aggregation const&>(agg));
+}
+
 }  // namespace detail
 
 std::vector<std::unique_ptr<aggregation>> aggregation::get_simple_aggregations(
diff --git a/cpp/src/aggregation/aggregation.cu b/cpp/src/aggregation/aggregation.cu
index d915c85bf85..3a6ff36c424 100644
--- a/cpp/src/aggregation/aggregation.cu
+++ b/cpp/src/aggregation/aggregation.cu
@@ -17,15 +17,14 @@
 #include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <vector>
-
 namespace cudf {
 namespace detail {
 void initialize_with_identity(mutable_table_view& table,
-                              std::vector<aggregation::Kind> const& aggs,
+                              host_span<cudf::aggregation::Kind const> aggs,
                               rmm::cuda_stream_view stream)
 {
   // TODO: Initialize all the columns in a single kernel instead of invoking one
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index ec63504a414..2f255e7a07c 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,15 +27,17 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/type_traits>
+
 namespace cudf {
 namespace binops {
 namespace compiled {
 
 template <typename BinaryOperator, typename TypeLhs, typename TypeRhs>
-constexpr bool is_bool_result()
+CUDF_HOST_DEVICE constexpr bool is_bool_result()
 {
-  using ReturnType = std::invoke_result_t<BinaryOperator, TypeLhs, TypeRhs>;
-  return std::is_same_v<bool, ReturnType>;
+  using ReturnType = cuda::std::invoke_result_t<BinaryOperator, TypeLhs, TypeRhs>;
+  return cuda::std::is_same_v<bool, ReturnType>;
 }
 
 /**
@@ -51,7 +53,7 @@ struct type_casted_accessor {
   {
     if constexpr (column_device_view::has_element_accessor<Element>()) {
       auto const element = col.element<Element>(is_scalar ? 0 : i);
-      if constexpr (std::is_convertible_v<Element, CastType>) {
+      if constexpr (cuda::std::is_convertible_v<Element, CastType>) {
         return static_cast<CastType>(element);
       } else if constexpr (is_fixed_point<Element>() && cuda::std::is_floating_point_v<CastType>) {
         return convert_fixed_to_floating<CastType>(element);
@@ -75,7 +77,7 @@ struct typed_casted_writer {
                                     FromType val) const
   {
     if constexpr (mutable_column_device_view::has_element_accessor<Element>() and
-                  std::is_constructible_v<Element, FromType>) {
+                  cuda::std::is_constructible_v<Element, FromType>) {
       col.element<Element>(i) = static_cast<Element>(val);
     } else if constexpr (is_fixed_point<Element>()) {
       auto const scale = numeric::scale_type{col.type().scale()};
@@ -109,18 +111,18 @@ struct ops_wrapper {
   template <typename TypeCommon>
   __device__ void operator()(size_type i)
   {
-    if constexpr (std::is_invocable_v<BinaryOperator, TypeCommon, TypeCommon>) {
+    if constexpr (cuda::std::is_invocable_v<BinaryOperator, TypeCommon, TypeCommon>) {
       TypeCommon x =
         type_dispatcher(lhs.type(), type_casted_accessor<TypeCommon>{}, i, lhs, is_lhs_scalar);
       TypeCommon y =
         type_dispatcher(rhs.type(), type_casted_accessor<TypeCommon>{}, i, rhs, is_rhs_scalar);
       auto result = [&]() {
-        if constexpr (std::is_same_v<BinaryOperator, ops::NullEquals> or
-                      std::is_same_v<BinaryOperator, ops::NullNotEquals> or
-                      std::is_same_v<BinaryOperator, ops::NullLogicalAnd> or
-                      std::is_same_v<BinaryOperator, ops::NullLogicalOr> or
-                      std::is_same_v<BinaryOperator, ops::NullMax> or
-                      std::is_same_v<BinaryOperator, ops::NullMin>) {
+        if constexpr (cuda::std::is_same_v<BinaryOperator, ops::NullEquals> or
+                      cuda::std::is_same_v<BinaryOperator, ops::NullNotEquals> or
+                      cuda::std::is_same_v<BinaryOperator, ops::NullLogicalAnd> or
+                      cuda::std::is_same_v<BinaryOperator, ops::NullLogicalOr> or
+                      cuda::std::is_same_v<BinaryOperator, ops::NullMax> or
+                      cuda::std::is_same_v<BinaryOperator, ops::NullMin>) {
           bool output_valid = false;
           auto result       = BinaryOperator{}.template operator()<TypeCommon, TypeCommon>(
             x,
@@ -134,7 +136,7 @@ struct ops_wrapper {
           return BinaryOperator{}.template operator()<TypeCommon, TypeCommon>(x, y);
         }
         // To suppress nvcc warning
-        return std::invoke_result_t<BinaryOperator, TypeCommon, TypeCommon>{};
+        return cuda::std::invoke_result_t<BinaryOperator, TypeCommon, TypeCommon>{};
       }();
       if constexpr (is_bool_result<BinaryOperator, TypeCommon, TypeCommon>())
         out.element<decltype(result)>(i) = result;
@@ -161,16 +163,16 @@ struct ops2_wrapper {
   __device__ void operator()(size_type i)
   {
     if constexpr (!has_common_type_v<TypeLhs, TypeRhs> and
-                  std::is_invocable_v<BinaryOperator, TypeLhs, TypeRhs>) {
+                  cuda::std::is_invocable_v<BinaryOperator, TypeLhs, TypeRhs>) {
       TypeLhs x   = lhs.element<TypeLhs>(is_lhs_scalar ? 0 : i);
       TypeRhs y   = rhs.element<TypeRhs>(is_rhs_scalar ? 0 : i);
       auto result = [&]() {
-        if constexpr (std::is_same_v<BinaryOperator, ops::NullEquals> or
-                      std::is_same_v<BinaryOperator, ops::NullNotEquals> or
-                      std::is_same_v<BinaryOperator, ops::NullLogicalAnd> or
-                      std::is_same_v<BinaryOperator, ops::NullLogicalOr> or
-                      std::is_same_v<BinaryOperator, ops::NullMax> or
-                      std::is_same_v<BinaryOperator, ops::NullMin>) {
+        if constexpr (cuda::std::is_same_v<BinaryOperator, ops::NullEquals> or
+                      cuda::std::is_same_v<BinaryOperator, ops::NullNotEquals> or
+                      cuda::std::is_same_v<BinaryOperator, ops::NullLogicalAnd> or
+                      cuda::std::is_same_v<BinaryOperator, ops::NullLogicalOr> or
+                      cuda::std::is_same_v<BinaryOperator, ops::NullMax> or
+                      cuda::std::is_same_v<BinaryOperator, ops::NullMin>) {
           bool output_valid = false;
           auto result       = BinaryOperator{}.template operator()<TypeLhs, TypeRhs>(
             x,
@@ -184,7 +186,7 @@ struct ops2_wrapper {
           return BinaryOperator{}.template operator()<TypeLhs, TypeRhs>(x, y);
         }
         // To suppress nvcc warning
-        return std::invoke_result_t<BinaryOperator, TypeLhs, TypeRhs>{};
+        return cuda::std::invoke_result_t<BinaryOperator, TypeLhs, TypeRhs>{};
       }();
       if constexpr (is_bool_result<BinaryOperator, TypeLhs, TypeRhs>())
         out.element<decltype(result)>(i) = result;
diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp
index 7eb80c4249e..d36dacca739 100644
--- a/cpp/src/bitmask/is_element_valid.cpp
+++ b/cpp/src/bitmask/is_element_valid.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/detail/is_element_valid.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -30,15 +31,14 @@ bool is_element_valid_sync(column_view const& col_view,
   CUDF_EXPECTS(element_index >= 0 and element_index < col_view.size(), "invalid index.");
   if (!col_view.nullable()) { return true; }
 
-  bitmask_type word = 0;
   // null_mask() returns device ptr to bitmask without offset
   size_type const index = element_index + col_view.offset();
-  CUDF_CUDA_TRY(cudaMemcpyAsync(&word,
-                                col_view.null_mask() + word_index(index),
-                                sizeof(bitmask_type),
-                                cudaMemcpyDefault,
-                                stream.value()));
-  stream.synchronize();
+
+  auto const word =
+    cudf::detail::make_host_vector_sync(
+      device_span<bitmask_type const>{col_view.null_mask() + word_index(index), 1}, stream)
+      .front();
+
   return static_cast<bool>(word & (bitmask_type{1} << intra_word_index(index)));
 }
 
diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu
index fc244521617..9dc39f01ab3 100644
--- a/cpp/src/column/column_device_view.cu
+++ b/cpp/src/column/column_device_view.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -60,13 +61,12 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str
   // A buffer of CPU memory is allocated to hold the ColumnDeviceView
   // objects. Once filled, the CPU memory is copied to device memory
   // and then set into the d_children member pointer.
-  std::vector<char> staging_buffer(descendant_storage_bytes);
+  auto staging_buffer = detail::make_host_vector<char>(descendant_storage_bytes, stream);
 
   // Each ColumnDeviceView instance may have child objects that
   // require setting some internal device pointers before being copied
   // from CPU to device.
-  rmm::device_buffer* const descendant_storage =
-    new rmm::device_buffer(descendant_storage_bytes, stream);
+  auto const descendant_storage = new rmm::device_uvector<char>(descendant_storage_bytes, stream);
 
   auto deleter = [descendant_storage](ColumnDeviceView* v) {
     v->destroy();
@@ -77,13 +77,7 @@ create_device_view_from_view(ColumnView const& source, rmm::cuda_stream_view str
     new ColumnDeviceView(source, staging_buffer.data(), descendant_storage->data()), deleter};
 
   // copy the CPU memory with all the children into device memory
-  CUDF_CUDA_TRY(cudaMemcpyAsync(descendant_storage->data(),
-                                staging_buffer.data(),
-                                descendant_storage->size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  stream.synchronize();
+  detail::cuda_memcpy<char>(*descendant_storage, staging_buffer, stream);
 
   return result;
 }
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index d8419760120..6fc49afd7ac 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -308,7 +308,11 @@ std::unique_ptr<column> for_each_concatenate(host_span<column_view const> views,
 
   auto count = 0;
   for (auto& v : views) {
-    thrust::copy(rmm::exec_policy(stream), v.begin<T>(), v.end<T>(), m_view.begin<T>() + count);
+    CUDF_CUDA_TRY(cudaMemcpyAsync(m_view.begin<T>() + count,
+                                  v.begin<T>(),
+                                  v.size() * sizeof(T),
+                                  cudaMemcpyDeviceToDevice,
+                                  stream.value()));
     count += v.size();
   }
 
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index e9443980320..3413f75357b 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -35,6 +35,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/functional>
+#include <cuda/std/functional>
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
@@ -997,7 +998,8 @@ struct packed_split_indices_and_src_buf_info {
       src_buf_info_size(
         cudf::util::round_up_safe(num_src_bufs * sizeof(src_buf_info), split_align)),
       // host-side
-      h_indices_and_source_info(indices_size + src_buf_info_size),
+      h_indices_and_source_info{
+        detail::make_host_vector<uint8_t>(indices_size + src_buf_info_size, stream)},
       h_indices{reinterpret_cast<size_type*>(h_indices_and_source_info.data())},
       h_src_buf_info{
         reinterpret_cast<src_buf_info*>(h_indices_and_source_info.data() + indices_size)}
@@ -1024,15 +1026,18 @@ struct packed_split_indices_and_src_buf_info {
       reinterpret_cast<size_type*>(reinterpret_cast<uint8_t*>(d_indices_and_source_info.data()) +
                                    indices_size + src_buf_info_size);
 
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      d_indices, h_indices, indices_size + src_buf_info_size, cudaMemcpyDefault, stream.value()));
+    detail::cuda_memcpy_async<uint8_t>(
+      device_span<uint8_t>{static_cast<uint8_t*>(d_indices_and_source_info.data()),
+                           h_indices_and_source_info.size()},
+      h_indices_and_source_info,
+      stream);
   }
 
   size_type const indices_size;
   std::size_t const src_buf_info_size;
   std::size_t offset_stack_size;
 
-  std::vector<uint8_t> h_indices_and_source_info;
+  detail::host_vector<uint8_t> h_indices_and_source_info;
   rmm::device_buffer d_indices_and_source_info;
 
   size_type* const h_indices;
@@ -1054,27 +1059,26 @@ struct packed_partition_buf_size_and_dst_buf_info {
       buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)},
       dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)},
       // host-side
-      h_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size),
+      h_buf_sizes_and_dst_info{
+        detail::make_host_vector<uint8_t>(buf_sizes_size + dst_buf_info_size, stream)},
       h_buf_sizes{reinterpret_cast<std::size_t*>(h_buf_sizes_and_dst_info.data())},
       h_dst_buf_info{
-        reinterpret_cast<dst_buf_info*>(h_buf_sizes_and_dst_info.data() + buf_sizes_size)},
+        reinterpret_cast<dst_buf_info*>(h_buf_sizes_and_dst_info.data() + buf_sizes_size),
+        num_bufs,
+        h_buf_sizes_and_dst_info.get_allocator().is_device_accessible()},
       // device-side
-      d_buf_sizes_and_dst_info(buf_sizes_size + dst_buf_info_size, stream, temp_mr),
+      d_buf_sizes_and_dst_info(h_buf_sizes_and_dst_info.size(), stream, temp_mr),
       d_buf_sizes{reinterpret_cast<std::size_t*>(d_buf_sizes_and_dst_info.data())},
       // destination buffer info
-      d_dst_buf_info{reinterpret_cast<dst_buf_info*>(
-        static_cast<uint8_t*>(d_buf_sizes_and_dst_info.data()) + buf_sizes_size)}
+      d_dst_buf_info{
+        reinterpret_cast<dst_buf_info*>(d_buf_sizes_and_dst_info.data() + buf_sizes_size), num_bufs}
   {
   }
 
   void copy_to_host()
   {
     // DtoH buf sizes and col info back to the host
-    CUDF_CUDA_TRY(cudaMemcpyAsync(h_buf_sizes,
-                                  d_buf_sizes,
-                                  buf_sizes_size + dst_buf_info_size,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    detail::cuda_memcpy_async<uint8_t>(h_buf_sizes_and_dst_info, d_buf_sizes_and_dst_info, stream);
   }
 
   rmm::cuda_stream_view const stream;
@@ -1083,13 +1087,13 @@ struct packed_partition_buf_size_and_dst_buf_info {
   std::size_t const buf_sizes_size;
   std::size_t const dst_buf_info_size;
 
-  std::vector<uint8_t> h_buf_sizes_and_dst_info;
+  detail::host_vector<uint8_t> h_buf_sizes_and_dst_info;
   std::size_t* const h_buf_sizes;
-  dst_buf_info* const h_dst_buf_info;
+  host_span<dst_buf_info> const h_dst_buf_info;
 
-  rmm::device_buffer d_buf_sizes_and_dst_info;
+  rmm::device_uvector<uint8_t> d_buf_sizes_and_dst_info;
   std::size_t* const d_buf_sizes;
-  dst_buf_info* const d_dst_buf_info;
+  device_span<dst_buf_info> const d_dst_buf_info;
 };
 
 // Packed block of memory 3:
@@ -1105,11 +1109,12 @@ struct packed_src_and_dst_pointers {
       src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)},
       dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)},
       // host-side
-      h_src_and_dst_buffers(src_bufs_size + dst_bufs_size),
+      h_src_and_dst_buffers{
+        detail::make_host_vector<uint8_t>(src_bufs_size + dst_bufs_size, stream)},
       h_src_bufs{reinterpret_cast<uint8_t const**>(h_src_and_dst_buffers.data())},
       h_dst_bufs{reinterpret_cast<uint8_t**>(h_src_and_dst_buffers.data() + src_bufs_size)},
       // device-side
-      d_src_and_dst_buffers{rmm::device_buffer(src_bufs_size + dst_bufs_size, stream, temp_mr)},
+      d_src_and_dst_buffers{h_src_and_dst_buffers.size(), stream, temp_mr},
       d_src_bufs{reinterpret_cast<uint8_t const**>(d_src_and_dst_buffers.data())},
       d_dst_bufs{reinterpret_cast<uint8_t**>(
         reinterpret_cast<uint8_t*>(d_src_and_dst_buffers.data()) + src_bufs_size)}
@@ -1120,18 +1125,18 @@ struct packed_src_and_dst_pointers {
 
   void copy_to_device()
   {
-    CUDF_CUDA_TRY(cudaMemcpyAsync(d_src_and_dst_buffers.data(),
-                                  h_src_and_dst_buffers.data(),
-                                  src_bufs_size + dst_bufs_size,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+    detail::cuda_memcpy_async<uint8_t>(
+      device_span<uint8_t>{static_cast<uint8_t*>(d_src_and_dst_buffers.data()),
+                           d_src_and_dst_buffers.size()},
+      h_src_and_dst_buffers,
+      stream);
   }
 
   rmm::cuda_stream_view const stream;
   std::size_t const src_bufs_size;
   std::size_t const dst_bufs_size;
 
-  std::vector<uint8_t> h_src_and_dst_buffers;
+  detail::host_vector<uint8_t> h_src_and_dst_buffers;
   uint8_t const** const h_src_bufs;
   uint8_t** const h_dst_bufs;
 
@@ -1204,7 +1209,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits(
     std::make_unique<packed_partition_buf_size_and_dst_buf_info>(
       num_partitions, num_bufs, stream, temp_mr);
 
-  auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info;
+  auto const d_dst_buf_info = partition_buf_size_and_dst_buf_info->d_dst_buf_info.begin();
   auto const d_buf_sizes    = partition_buf_size_and_dst_buf_info->d_buf_sizes;
 
   auto const split_indices_and_src_buf_info = packed_split_indices_and_src_buf_info(
@@ -1517,26 +1522,19 @@ std::unique_ptr<chunk_iteration_state> chunk_iteration_state::create(
    */
   if (user_buffer_size != 0) {
     // copy the batch offsets back to host
-    std::vector<std::size_t> h_offsets(num_batches + 1);
-    {
-      rmm::device_uvector<std::size_t> offsets(h_offsets.size(), stream, temp_mr);
+    auto const h_offsets = [&] {
+      rmm::device_uvector<std::size_t> offsets(num_batches + 1, stream, temp_mr);
       auto const batch_byte_size_iter = cudf::detail::make_counting_transform_iterator(
         0, batch_byte_size_function{num_batches, d_batched_dst_buf_info.begin()});
 
-      thrust::exclusive_scan(rmm::exec_policy(stream, temp_mr),
+      thrust::exclusive_scan(rmm::exec_policy_nosync(stream, temp_mr),
                              batch_byte_size_iter,
-                             batch_byte_size_iter + num_batches + 1,
+                             batch_byte_size_iter + offsets.size(),
                              offsets.begin());
 
-      CUDF_CUDA_TRY(cudaMemcpyAsync(h_offsets.data(),
-                                    offsets.data(),
-                                    sizeof(std::size_t) * offsets.size(),
-                                    cudaMemcpyDefault,
-                                    stream.value()));
-
       // the next part is working on the CPU, so we want to synchronize here
-      stream.synchronize();
-    }
+      return detail::make_host_vector_sync(offsets, stream);
+    }();
 
     std::vector<std::size_t> num_batches_per_iteration;
     std::vector<std::size_t> size_of_batches_per_iteration;
@@ -1675,7 +1673,7 @@ std::unique_ptr<chunk_iteration_state> compute_batches(int num_bufs,
         if (bytes == 0) { return {1, 0}; }
 
         // The number of batches we want to subdivide this buffer into
-        std::size_t const num_batches = std::max(
+        std::size_t const num_batches = cuda::std::max(
           std::size_t{1}, util::round_up_unsafe(bytes, desired_batch_size) / desired_batch_size);
 
         // NOTE: leaving batch size as a separate parameter for future tuning
@@ -1698,7 +1696,7 @@ void copy_data(int num_batches_to_copy,
                int starting_batch,
                uint8_t const** d_src_bufs,
                uint8_t** d_dst_bufs,
-               rmm::device_uvector<dst_buf_info>& d_dst_buf_info,
+               device_span<dst_buf_info> d_dst_buf_info,
                uint8_t* user_buffer,
                rmm::cuda_stream_view stream)
 {
@@ -1832,15 +1830,9 @@ struct contiguous_split_state {
                           keys + num_batches_total,
                           values,
                           thrust::make_discard_iterator(),
-                          dst_valid_count_output_iterator{d_orig_dst_buf_info});
-
-    CUDF_CUDA_TRY(cudaMemcpyAsync(h_orig_dst_buf_info,
-                                  d_orig_dst_buf_info,
-                                  partition_buf_size_and_dst_buf_info->dst_buf_info_size,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
+                          dst_valid_count_output_iterator{d_orig_dst_buf_info.begin()});
 
-    stream.synchronize();
+    detail::cuda_memcpy<dst_buf_info>(h_orig_dst_buf_info, d_orig_dst_buf_info, stream);
 
     // not necessary for the non-chunked case, but it makes it so further calls to has_next
     // return false, just in case
@@ -1888,7 +1880,7 @@ struct contiguous_split_state {
     }
 
     auto& h_dst_buf_info  = partition_buf_size_and_dst_buf_info->h_dst_buf_info;
-    auto cur_dst_buf_info = h_dst_buf_info;
+    auto cur_dst_buf_info = h_dst_buf_info.data();
     detail::metadata_builder mb{input.num_columns()};
 
     populate_metadata(input.begin(), input.end(), cur_dst_buf_info, mb);
@@ -1926,7 +1918,7 @@ struct contiguous_split_state {
 
     // Second pass: uses `dst_buf_info` to break down the work into 1MB batches.
     chunk_iter_state = compute_batches(num_bufs,
-                                       partition_buf_size_and_dst_buf_info->d_dst_buf_info,
+                                       partition_buf_size_and_dst_buf_info->d_dst_buf_info.data(),
                                        partition_buf_size_and_dst_buf_info->h_buf_sizes,
                                        num_partitions,
                                        user_buffer_size,
@@ -1962,7 +1954,7 @@ struct contiguous_split_state {
     auto& h_dst_buf_info = partition_buf_size_and_dst_buf_info->h_dst_buf_info;
     auto& h_dst_bufs     = src_and_dst_pointers->h_dst_bufs;
 
-    auto cur_dst_buf_info = h_dst_buf_info;
+    auto cur_dst_buf_info = h_dst_buf_info.data();
     detail::metadata_builder mb(input.num_columns());
 
     for (std::size_t idx = 0; idx < num_partitions; idx++) {
diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu
index 59c8453cf33..4715931a7a9 100644
--- a/cpp/src/dictionary/remove_keys.cu
+++ b/cpp/src/dictionary/remove_keys.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <cudf/column/column_factories.hpp>
 #include <cudf/detail/copy_if.cuh>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/indexalator.cuh>
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index c42038026e5..4c90cd0eef5 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <cudf/aggregation/host_udf.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
@@ -32,7 +33,6 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
@@ -99,6 +99,8 @@ namespace {
 struct empty_column_constructor {
   column_view values;
   aggregation const& agg;
+  rmm::cuda_stream_view stream;
+  rmm::device_async_resource_ref mr;
 
   template <typename ValuesType, aggregation::Kind k>
   std::unique_ptr<cudf::column> operator()() const
@@ -108,7 +110,7 @@ struct empty_column_constructor {
 
     if constexpr (k == aggregation::Kind::COLLECT_LIST || k == aggregation::Kind::COLLECT_SET) {
       return make_lists_column(
-        0, make_empty_column(type_to_id<size_type>()), empty_like(values), 0, {});
+        0, make_empty_column(type_to_id<size_type>()), empty_like(values), 0, {}, stream, mr);
     }
 
     if constexpr (k == aggregation::Kind::HISTOGRAM) {
@@ -116,7 +118,9 @@ struct empty_column_constructor {
                                make_empty_column(type_to_id<size_type>()),
                                cudf::reduction::detail::make_empty_histogram_like(values),
                                0,
-                               {});
+                               {},
+                               stream,
+                               mr);
     }
     if constexpr (k == aggregation::Kind::MERGE_HISTOGRAM) { return empty_like(values); }
 
@@ -140,31 +144,41 @@ struct empty_column_constructor {
       return empty_like(values);
     }
 
+    if constexpr (k == aggregation::Kind::HOST_UDF) {
+      auto const& udf_ptr = dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
+      return std::get<std::unique_ptr<column>>(udf_ptr->get_empty_output(std::nullopt, stream, mr));
+    }
+
     return make_empty_column(target_type(values.type(), k));
   }
 };
 
 /// Make an empty table with appropriate types for requested aggs
 template <typename RequestType>
-auto empty_results(host_span<RequestType const> requests)
+auto empty_results(host_span<RequestType const> requests,
+                   rmm::cuda_stream_view stream,
+                   rmm::device_async_resource_ref mr)
 {
   std::vector<aggregation_result> empty_results;
 
-  std::transform(
-    requests.begin(), requests.end(), std::back_inserter(empty_results), [](auto const& request) {
-      std::vector<std::unique_ptr<column>> results;
-
-      std::transform(
-        request.aggregations.begin(),
-        request.aggregations.end(),
-        std::back_inserter(results),
-        [&request](auto const& agg) {
-          return cudf::detail::dispatch_type_and_aggregation(
-            request.values.type(), agg->kind, empty_column_constructor{request.values, *agg});
-        });
-
-      return aggregation_result{std::move(results)};
-    });
+  std::transform(requests.begin(),
+                 requests.end(),
+                 std::back_inserter(empty_results),
+                 [stream, mr](auto const& request) {
+                   std::vector<std::unique_ptr<column>> results;
+
+                   std::transform(request.aggregations.begin(),
+                                  request.aggregations.end(),
+                                  std::back_inserter(results),
+                                  [&request, stream, mr](auto const& agg) {
+                                    return cudf::detail::dispatch_type_and_aggregation(
+                                      request.values.type(),
+                                      agg->kind,
+                                      empty_column_constructor{request.values, *agg, stream, mr});
+                                  });
+
+                   return aggregation_result{std::move(results)};
+                 });
 
   return empty_results;
 }
@@ -206,7 +220,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggr
 
   verify_valid_requests(requests);
 
-  if (_keys.num_rows() == 0) { return {empty_like(_keys), empty_results(requests)}; }
+  if (_keys.num_rows() == 0) { return {empty_like(_keys), empty_results(requests, stream, mr)}; }
 
   return dispatch_aggregation(requests, stream, mr);
 }
@@ -226,7 +240,9 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan
 
   verify_valid_requests(requests);
 
-  if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); }
+  if (_keys.num_rows() == 0) {
+    return std::pair(empty_like(_keys), empty_results(requests, stream, mr));
+  }
 
   return sort_scan(requests, stream, mr);
 }
diff --git a/cpp/src/groupby/hash/compute_aggregations.cuh b/cpp/src/groupby/hash/compute_aggregations.cuh
index e8b29a0e7a8..9c9a4c97bff 100644
--- a/cpp/src/groupby/hash/compute_aggregations.cuh
+++ b/cpp/src/groupby/hash/compute_aggregations.cuh
@@ -60,7 +60,7 @@ rmm::device_uvector<cudf::size_type> compute_aggregations(
   rmm::cuda_stream_view stream)
 {
   // flatten the aggs to a table that can be operated on by aggregate_row
-  auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests);
+  auto [flattened_values, agg_kinds, aggs] = flatten_single_pass_aggs(requests, stream);
   auto const d_agg_kinds                   = cudf::detail::make_device_uvector_async(
     agg_kinds, stream, rmm::mr::get_current_device_resource());
 
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cu b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
index 6025686953e..d2830f7d905 100644
--- a/cpp/src/groupby/hash/compute_global_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cu
@@ -24,7 +24,7 @@ template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<global_
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   global_set_t& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
index 00db149c6d9..671ee2ea31f 100644
--- a/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.cuh
@@ -25,6 +25,7 @@
 #include <cudf/groupby.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -44,7 +45,7 @@ rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   SetType& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
index 0777b9ffd93..437823a3fea 100644
--- a/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs.hpp
@@ -19,6 +19,7 @@
 #include <cudf/groupby.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -34,7 +35,7 @@ rmm::device_uvector<cudf::size_type> compute_global_memory_aggs(
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   SetType& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,
diff --git a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
index 209e2b7f20a..7cb3f8f190b 100644
--- a/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
+++ b/cpp/src/groupby/hash/compute_global_memory_aggs_null.cu
@@ -24,7 +24,7 @@ template rmm::device_uvector<cudf::size_type> compute_global_memory_aggs<nullabl
   bitmask_type const* row_bitmask,
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> const& agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   nullable_global_set_t& global_set,
   std::vector<std::unique_ptr<aggregation>>& aggregations,
   cudf::detail::result_cache* sparse_results,
diff --git a/cpp/src/groupby/hash/compute_groupby.cu b/cpp/src/groupby/hash/compute_groupby.cu
index e1dbf2a3d9e..9648d942513 100644
--- a/cpp/src/groupby/hash/compute_groupby.cu
+++ b/cpp/src/groupby/hash/compute_groupby.cu
@@ -61,7 +61,7 @@ std::unique_ptr<table> compute_groupby(table_view const& keys,
     d_row_equal,
     probing_scheme_t{d_row_hash},
     cuco::thread_scope_device,
-    cuco::storage<GROUPBY_WINDOW_SIZE>{},
+    cuco::storage<GROUPBY_BUCKET_SIZE>{},
     cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
     stream.value()};
 
diff --git a/cpp/src/groupby/hash/compute_mapping_indices.cuh b/cpp/src/groupby/hash/compute_mapping_indices.cuh
index d353830780f..f86a93109be 100644
--- a/cpp/src/groupby/hash/compute_mapping_indices.cuh
+++ b/cpp/src/groupby/hash/compute_mapping_indices.cuh
@@ -106,15 +106,15 @@ CUDF_KERNEL void mapping_indices_kernel(cudf::size_type num_input_rows,
   __shared__ cudf::size_type shared_set_indices[GROUPBY_SHM_MAX_ELEMENTS];
 
   // Shared set initialization
-  __shared__ cuco::window<cudf::size_type, GROUPBY_WINDOW_SIZE> windows[window_extent.value()];
+  __shared__ cuco::bucket<cudf::size_type, GROUPBY_BUCKET_SIZE> buckets[bucket_extent.value()];
 
   auto raw_set = cuco::static_set_ref{
     cuco::empty_key<cudf::size_type>{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
     global_set.key_eq(),
     probing_scheme_t{global_set.hash_function()},
     cuco::thread_scope_block,
-    cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, decltype(window_extent)>{
-      window_extent, windows}};
+    cuco::bucket_storage_ref<cudf::size_type, GROUPBY_BUCKET_SIZE, decltype(bucket_extent)>{
+      bucket_extent, buckets}};
   auto shared_set = raw_set.rebind_operators(cuco::insert_and_find);
 
   auto const block = cooperative_groups::this_thread_block();
diff --git a/cpp/src/groupby/hash/create_sparse_results_table.cu b/cpp/src/groupby/hash/create_sparse_results_table.cu
index bc32e306b3f..a835736235c 100644
--- a/cpp/src/groupby/hash/create_sparse_results_table.cu
+++ b/cpp/src/groupby/hash/create_sparse_results_table.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/aggregation/aggregation.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -48,7 +49,7 @@ void extract_populated_keys(SetType const& key_set,
 template <typename GlobalSetType>
 cudf::table create_sparse_results_table(cudf::table_view const& flattened_values,
                                         cudf::aggregation::Kind const* d_agg_kinds,
-                                        std::vector<cudf::aggregation::Kind> agg_kinds,
+                                        host_span<cudf::aggregation::Kind const> agg_kinds,
                                         bool direct_aggregations,
                                         GlobalSetType const& global_set,
                                         rmm::device_uvector<cudf::size_type>& populated_keys,
@@ -107,7 +108,7 @@ template void extract_populated_keys<nullable_global_set_t>(
 template cudf::table create_sparse_results_table<global_set_t>(
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   bool direct_aggregations,
   global_set_t const& global_set,
   rmm::device_uvector<cudf::size_type>& populated_keys,
@@ -116,7 +117,7 @@ template cudf::table create_sparse_results_table<global_set_t>(
 template cudf::table create_sparse_results_table<nullable_global_set_t>(
   cudf::table_view const& flattened_values,
   cudf::aggregation::Kind const* d_agg_kinds,
-  std::vector<cudf::aggregation::Kind> agg_kinds,
+  host_span<cudf::aggregation::Kind const> agg_kinds,
   bool direct_aggregations,
   nullable_global_set_t const& global_set,
   rmm::device_uvector<cudf::size_type>& populated_keys,
diff --git a/cpp/src/groupby/hash/create_sparse_results_table.hpp b/cpp/src/groupby/hash/create_sparse_results_table.hpp
index 8155ce852e0..4e2fa81bdb7 100644
--- a/cpp/src/groupby/hash/create_sparse_results_table.hpp
+++ b/cpp/src/groupby/hash/create_sparse_results_table.hpp
@@ -20,12 +20,11 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 
-#include <vector>
-
 namespace cudf::groupby::detail::hash {
 /**
  * @brief Computes and returns a device vector containing all populated keys in
@@ -47,7 +46,7 @@ void extract_populated_keys(SetType const& key_set,
 template <typename GlobalSetType>
 cudf::table create_sparse_results_table(cudf::table_view const& flattened_values,
                                         cudf::aggregation::Kind const* d_agg_kinds,
-                                        std::vector<cudf::aggregation::Kind> agg_kinds,
+                                        host_span<cudf::aggregation::Kind const> agg_kinds,
                                         bool direct_aggregations,
                                         GlobalSetType const& global_set,
                                         rmm::device_uvector<cudf::size_type>& populated_keys,
diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp
index b2048a9fbb8..a533f7a6448 100644
--- a/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp
+++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/types.hpp>
@@ -102,12 +103,15 @@ class groupby_simple_aggregations_collector final
 };
 
 // flatten aggs to filter in single pass aggs
-std::tuple<table_view, std::vector<aggregation::Kind>, std::vector<std::unique_ptr<aggregation>>>
-flatten_single_pass_aggs(host_span<aggregation_request const> requests)
+std::tuple<table_view,
+           cudf::detail::host_vector<aggregation::Kind>,
+           std::vector<std::unique_ptr<aggregation>>>
+flatten_single_pass_aggs(host_span<aggregation_request const> requests,
+                         rmm::cuda_stream_view stream)
 {
   std::vector<column_view> columns;
   std::vector<std::unique_ptr<aggregation>> aggs;
-  std::vector<aggregation::Kind> agg_kinds;
+  auto agg_kinds = cudf::detail::make_empty_host_vector<aggregation::Kind>(requests.size(), stream);
 
   for (auto const& request : requests) {
     auto const& agg_v = request.aggregations;
diff --git a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
index dfad51f27d4..e3c17ca972c 100644
--- a/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
+++ b/cpp/src/groupby/hash/flatten_single_pass_aggs.hpp
@@ -26,7 +26,10 @@
 namespace cudf::groupby::detail::hash {
 
 // flatten aggs to filter in single pass aggs
-std::tuple<table_view, std::vector<aggregation::Kind>, std::vector<std::unique_ptr<aggregation>>>
-flatten_single_pass_aggs(host_span<aggregation_request const> requests);
+std::tuple<table_view,
+           cudf::detail::host_vector<aggregation::Kind>,
+           std::vector<std::unique_ptr<aggregation>>>
+flatten_single_pass_aggs(host_span<aggregation_request const> requests,
+                         rmm::cuda_stream_view stream);
 
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu
index 37a61c1a22c..b71e20938d6 100644
--- a/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu
+++ b/cpp/src/groupby/hash/hash_compound_agg_finalizer.cu
@@ -170,7 +170,8 @@ void hash_compound_agg_finalizer<SetType>::visit(cudf::detail::var_aggregation c
     cudf::detail::target_type(result_type, agg.kind), col.size(), mask_state::ALL_NULL, stream);
   auto var_result_view = mutable_column_device_view::create(var_result->mutable_view(), stream);
   mutable_table_view var_table_view{{var_result->mutable_view()}};
-  cudf::detail::initialize_with_identity(var_table_view, {agg.kind}, stream);
+  cudf::detail::initialize_with_identity(
+    var_table_view, host_span<cudf::aggregation::Kind const>(&agg.kind, 1), stream);
 
   thrust::for_each_n(
     rmm::exec_policy_nosync(stream),
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
index f950e03e0fb..92925e11bac 100644
--- a/cpp/src/groupby/hash/helpers.cuh
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -27,7 +27,7 @@ namespace cudf::groupby::detail::hash {
 CUDF_HOST_DEVICE auto constexpr GROUPBY_CG_SIZE = 1;
 
 /// Number of slots per thread
-CUDF_HOST_DEVICE auto constexpr GROUPBY_WINDOW_SIZE = 1;
+CUDF_HOST_DEVICE auto constexpr GROUPBY_BUCKET_SIZE = 1;
 
 /// Thread block size
 CUDF_HOST_DEVICE auto constexpr GROUPBY_BLOCK_SIZE = 128;
@@ -48,9 +48,9 @@ using shmem_extent_t =
   cuco::extent<cudf::size_type,
                static_cast<cudf::size_type>(static_cast<double>(GROUPBY_SHM_MAX_ELEMENTS) * 1.43)>;
 
-/// Number of windows needed by each shared memory hash set
-CUDF_HOST_DEVICE auto constexpr window_extent =
-  cuco::make_window_extent<GROUPBY_CG_SIZE, GROUPBY_WINDOW_SIZE>(shmem_extent_t{});
+/// Number of buckets needed by each shared memory hash set
+CUDF_HOST_DEVICE auto constexpr bucket_extent =
+  cuco::make_bucket_extent<GROUPBY_CG_SIZE, GROUPBY_BUCKET_SIZE>(shmem_extent_t{});
 
 using row_hash_t =
   cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
@@ -75,7 +75,7 @@ using global_set_t = cuco::static_set<cudf::size_type,
                                       row_comparator_t,
                                       probing_scheme_t,
                                       cudf::detail::cuco_allocator<char>,
-                                      cuco::storage<GROUPBY_WINDOW_SIZE>>;
+                                      cuco::storage<GROUPBY_BUCKET_SIZE>>;
 
 using nullable_global_set_t = cuco::static_set<cudf::size_type,
                                                cuco::extent<int64_t>,
@@ -83,7 +83,7 @@ using nullable_global_set_t = cuco::static_set<cudf::size_type,
                                                nullable_row_comparator_t,
                                                probing_scheme_t,
                                                cudf::detail::cuco_allocator<char>,
-                                               cuco::storage<GROUPBY_WINDOW_SIZE>>;
+                                               cuco::storage<GROUPBY_BUCKET_SIZE>>;
 
 template <typename Op>
 using hash_set_ref_t = cuco::static_set_ref<
@@ -91,7 +91,7 @@ using hash_set_ref_t = cuco::static_set_ref<
   cuda::thread_scope_device,
   row_comparator_t,
   probing_scheme_t,
-  cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>,
+  cuco::bucket_storage_ref<cudf::size_type, GROUPBY_BUCKET_SIZE, cuco::bucket_extent<int64_t>>,
   Op>;
 
 template <typename Op>
@@ -100,6 +100,6 @@ using nullable_hash_set_ref_t = cuco::static_set_ref<
   cuda::thread_scope_device,
   nullable_row_comparator_t,
   probing_scheme_t,
-  cuco::aow_storage_ref<cudf::size_type, GROUPBY_WINDOW_SIZE, cuco::window_extent<int64_t>>,
+  cuco::bucket_storage_ref<cudf::size_type, GROUPBY_BUCKET_SIZE, cuco::bucket_extent<int64_t>>,
   Op>;
 }  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 7a8a1883ed4..6480070e85a 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -19,6 +19,7 @@
 #include "groupby/sort/group_reductions.hpp"
 
 #include <cudf/aggregation.hpp>
+#include <cudf/aggregation/host_udf.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
@@ -208,10 +209,7 @@ void aggregate_result_functor::operator()<aggregation::MIN>(aggregation const& a
       operator()<aggregation::ARGMIN>(*argmin_agg);
       column_view const argmin_result = cache.get_result(values, *argmin_agg);
 
-      // We make a view of ARGMIN result without a null mask and gather using
-      // this mask. The values in data buffer of ARGMIN result corresponding
-      // to null values was initialized to ARGMIN_SENTINEL which is an out of
-      // bounds index value and causes the gathered value to be null.
+      // Compute the ARGMIN result without the null mask in the gather map.
       column_view const null_removed_map(
         data_type(type_to_id<size_type>()),
         argmin_result.size(),
@@ -250,10 +248,7 @@ void aggregate_result_functor::operator()<aggregation::MAX>(aggregation const& a
       operator()<aggregation::ARGMAX>(*argmax_agg);
       column_view const argmax_result = cache.get_result(values, *argmax_agg);
 
-      // We make a view of ARGMAX result without a null mask and gather using
-      // this mask. The values in data buffer of ARGMAX result corresponding
-      // to null values was initialized to ARGMAX_SENTINEL which is an out of
-      // bounds index value and causes the gathered value to be null.
+      // Compute the ARGMAX result without the null mask in the gather map.
       column_view const null_removed_map(
         data_type(type_to_id<size_type>()),
         argmax_result.size(),
@@ -795,6 +790,65 @@ void aggregate_result_functor::operator()<aggregation::MERGE_TDIGEST>(aggregatio
                                                               mr));
 }
 
+template <>  // HOST_UDF: gather the requested inputs, invoke the UDF, cache its output
+void aggregate_result_functor::operator()<aggregation::HOST_UDF>(aggregation const& agg)
+{
+  if (cache.has_result(values, agg)) { return; }  // already computed for these values
+
+  auto const& udf_ptr   = dynamic_cast<cudf::detail::host_udf_aggregation const&>(agg).udf_ptr;
+  auto const data_attrs = [&]() -> host_udf_base::data_attribute_set_t {
+    if (auto tmp = udf_ptr->get_required_data(); !tmp.empty()) { return tmp; }
+    // An empty set from get_required_data() means every groupby attribute below is supplied.
+    return {host_udf_base::groupby_data_attribute::INPUT_VALUES,
+            host_udf_base::groupby_data_attribute::GROUPED_VALUES,
+            host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES,
+            host_udf_base::groupby_data_attribute::NUM_GROUPS,
+            host_udf_base::groupby_data_attribute::GROUP_OFFSETS,
+            host_udf_base::groupby_data_attribute::GROUP_LABELS};
+  }();
+
+  // Do not cache udf_input, as the actual input data may change from run to run.
+  host_udf_base::input_map_t udf_input;  // maps each requested attribute to its input data
+  for (auto const& attr : data_attrs) {
+    CUDF_EXPECTS(std::holds_alternative<host_udf_base::groupby_data_attribute>(attr.value) ||
+                   std::holds_alternative<std::unique_ptr<aggregation>>(attr.value),
+                 "Invalid input data attribute for HOST_UDF groupby aggregation.");
+    if (std::holds_alternative<host_udf_base::groupby_data_attribute>(attr.value)) {
+      switch (std::get<host_udf_base::groupby_data_attribute>(attr.value)) {
+        case host_udf_base::groupby_data_attribute::INPUT_VALUES:
+          udf_input.emplace(attr, values);
+          break;
+        case host_udf_base::groupby_data_attribute::GROUPED_VALUES:
+          udf_input.emplace(attr, get_grouped_values());
+          break;
+        case host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES:
+          udf_input.emplace(attr, get_sorted_values());
+          break;
+        case host_udf_base::groupby_data_attribute::NUM_GROUPS:
+          udf_input.emplace(attr, helper.num_groups(stream));
+          break;
+        case host_udf_base::groupby_data_attribute::GROUP_OFFSETS:
+          udf_input.emplace(attr, helper.group_offsets(stream));
+          break;
+        case host_udf_base::groupby_data_attribute::GROUP_LABELS:
+          udf_input.emplace(attr, helper.group_labels(stream));
+          break;
+        default: CUDF_UNREACHABLE("Invalid input data attribute for HOST_UDF groupby aggregation.");
+      }
+    } else {  // data is the result of another aggregation, evaluated through this functor
+      auto other_agg = std::get<std::unique_ptr<aggregation>>(attr.value)->clone();
+      cudf::detail::aggregation_dispatcher(other_agg->kind, *this, *other_agg);
+      auto result = cache.get_result(values, *other_agg);
+      udf_input.emplace(std::move(other_agg), std::move(result));
+    }
+  }
+
+  auto output = (*udf_ptr)(udf_input, stream, mr);  // invoke the UDF on the gathered inputs
+  CUDF_EXPECTS(std::holds_alternative<std::unique_ptr<column>>(output),
+               "Invalid output type from HOST_UDF groupby aggregation.");
+  cache.add_result(values, agg, std::get<std::unique_ptr<column>>(std::move(output)));
+}
+
 }  // namespace detail
 
 // Sort-based groupby
diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu
index 7dce341130e..329c7c4eb32 100644
--- a/cpp/src/groupby/sort/group_argmax.cu
+++ b/cpp/src/groupby/sort/group_argmax.cu
@@ -42,22 +42,21 @@ std::unique_ptr<column> group_argmax(column_view const& values,
                                  stream,
                                  mr);
 
-  // The functor returns the index of maximum in the sorted values.
-  // We need the index of maximum in the original unsorted values.
-  // So use indices to gather the sort order used to sort `values`.
-  // Gather map cannot be null so we make a view with the mask removed.
-  // The values in data buffer of indices corresponding to null values was
-  // initialized to ARGMAX_SENTINEL. Using gather_if.
-  // This can't use gather because nulls in gathered column will not store ARGMAX_SENTINEL.
-  auto indices_view = indices->mutable_view();
-  thrust::gather_if(rmm::exec_policy(stream),
-                    indices_view.begin<size_type>(),    // map first
-                    indices_view.end<size_type>(),      // map last
-                    indices_view.begin<size_type>(),    // stencil
-                    key_sort_order.begin<size_type>(),  // input
-                    indices_view.begin<size_type>(),    // result
-                    [] __device__(auto i) { return (i != cudf::detail::ARGMAX_SENTINEL); });
-  return indices;
+  // The functor returns the indices of maximums based on the sorted keys, but we need
+  // the indices into the original unsorted keys, so map them through key_sort_order.
+  // We do not use cudf::gather since we can move the null-mask separately.
+  auto indices_view = indices->view();
+  auto output       = rmm::device_uvector<size_type>(indices_view.size(), stream, mr);
+  thrust::gather(rmm::exec_policy_nosync(stream),
+                 indices_view.begin<size_type>(),    // map first
+                 indices_view.end<size_type>(),      // map last
+                 key_sort_order.begin<size_type>(),  // input
+                 output.data()                       // result (must not overlap map)
+  );
+  auto null_count = indices_view.null_count();
+  // Hold the mask in its unique_ptr so the moved-from device_buffer object is still freed.
+  auto null_mask = std::move(indices->release().null_mask);
+  return std::make_unique<column>(std::move(output), std::move(*null_mask), null_count);
 }
 
 }  // namespace detail
diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu
index c4bed330b9f..dbfc375fc20 100644
--- a/cpp/src/groupby/sort/group_argmin.cu
+++ b/cpp/src/groupby/sort/group_argmin.cu
@@ -21,6 +21,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 
 #include <thrust/gather.h>
 
@@ -42,22 +43,21 @@ std::unique_ptr<column> group_argmin(column_view const& values,
                                  stream,
                                  mr);
 
-  // The functor returns the index of minimum in the sorted values.
-  // We need the index of minimum in the original unsorted values.
-  // So use indices to gather the sort order used to sort `values`.
-  // The values in data buffer of indices corresponding to null values was
-  // initialized to ARGMIN_SENTINEL. Using gather_if.
-  // This can't use gather because nulls in gathered column will not store ARGMIN_SENTINEL.
-  auto indices_view = indices->mutable_view();
-  thrust::gather_if(rmm::exec_policy(stream),
-                    indices_view.begin<size_type>(),    // map first
-                    indices_view.end<size_type>(),      // map last
-                    indices_view.begin<size_type>(),    // stencil
-                    key_sort_order.begin<size_type>(),  // input
-                    indices_view.begin<size_type>(),    // result
-                    [] __device__(auto i) { return (i != cudf::detail::ARGMIN_SENTINEL); });
-
-  return indices;
+  // The functor returns the indices of minimums based on the sorted keys, but we need
+  // the indices into the original unsorted keys, so map them through key_sort_order.
+  // We do not use cudf::gather since we can move the null-mask separately.
+  auto indices_view = indices->view();
+  auto output       = rmm::device_uvector<size_type>(indices_view.size(), stream, mr);
+  thrust::gather(rmm::exec_policy_nosync(stream),
+                 indices_view.begin<size_type>(),    // map first
+                 indices_view.end<size_type>(),      // map last
+                 key_sort_order.begin<size_type>(),  // input
+                 output.data()                       // result (must not overlap map)
+  );
+  auto null_count = indices_view.null_count();
+  // Hold the mask in its unique_ptr so the moved-from device_buffer object is still freed.
+  auto null_mask = std::move(indices->release().null_mask);
+  return std::make_unique<column>(std::move(output), std::move(*null_mask), null_count);
 }
 
 }  // namespace detail
diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu
index 65bd5ac408f..583357d9090 100644
--- a/cpp/src/groupby/sort/group_rank_scan.cu
+++ b/cpp/src/groupby/sort/group_rank_scan.cu
@@ -29,6 +29,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/limits>
 #include <thrust/functional.h>
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/pair.h>
@@ -185,7 +186,7 @@ std::unique_ptr<column> max_rank_scan(column_view const& grouped_values,
     group_labels,
     group_offsets,
     [] __device__(bool unequal, auto row_index_in_group) {
-      return unequal ? row_index_in_group + 1 : std::numeric_limits<size_type>::max();
+      return unequal ? row_index_in_group + 1 : cuda::std::numeric_limits<size_type>::max();
     },
     DeviceMin{},
     has_nested_nulls(table_view{{grouped_values}}),
diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index 86835ea8a67..5082ad01327 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -107,7 +107,10 @@ struct group_scan_functor<K, T, std::enable_if_t<is_group_scan_supported<K, T>()
     if (values.is_empty()) { return result; }
 
     auto result_table = mutable_table_view({*result});
-    cudf::detail::initialize_with_identity(result_table, {K}, stream);
+    // Need an address of the aggregation kind to pass to the span
+    auto const kind = K;
+    cudf::detail::initialize_with_identity(
+      result_table, host_span<aggregation::Kind const>(&kind, 1), stream);
 
     auto result_view = mutable_column_device_view::create(result->mutable_view(), stream);
     auto values_view = column_device_view::create(values, stream);
diff --git a/cpp/src/groupby/sort/host_udf_aggregation.cpp b/cpp/src/groupby/sort/host_udf_aggregation.cpp
new file mode 100644
index 00000000000..0da47e17f48
--- /dev/null
+++ b/cpp/src/groupby/sort/host_udf_aggregation.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/aggregation/host_udf.hpp>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/utilities/visitor_overload.hpp>
+
+namespace cudf {
+// Copy constructor: clones a held aggregation; any other alternative is copied by value.
+host_udf_base::data_attribute::data_attribute(data_attribute const& other)
+  : value{std::visit(cudf::detail::visitor_overload{[](auto const& val) { return value_type{val}; },
+                                                    [](std::unique_ptr<aggregation> const& val) {
+                                                      return value_type{val->clone()};
+                                                    }},
+                     other.value)}
+{
+}
+// Hash functor: combines the variant's alternative index with a hash of the held value.
+std::size_t host_udf_base::data_attribute::hash::operator()(data_attribute const& attr) const
+{
+  auto const hash_value =
+    std::visit(cudf::detail::visitor_overload{
+                 [](auto const& val) { return std::hash<int>{}(static_cast<int>(val)); },
+                 [](std::unique_ptr<aggregation> const& val) { return val->do_hash(); }},
+               attr.value);
+  return std::hash<std::size_t>{}(attr.value.index()) ^ hash_value;
+}
+// Equality functor: aggregations compare via is_equal(), same-typed other values via ==.
+bool host_udf_base::data_attribute::equal_to::operator()(data_attribute const& lhs,
+                                                         data_attribute const& rhs) const
+{
+  auto const& lhs_val = lhs.value;
+  auto const& rhs_val = rhs.value;
+  if (lhs_val.index() != rhs_val.index()) { return false; }  // different alternatives
+  return std::visit(
+    cudf::detail::visitor_overload{
+      [](auto const& lhs_val, auto const& rhs_val) {
+        if constexpr (std::is_same_v<decltype(lhs_val), decltype(rhs_val)>) {
+          return lhs_val == rhs_val;
+        } else {
+          return false;
+        }
+      },
+      [](std::unique_ptr<aggregation> const& lhs_val, std::unique_ptr<aggregation> const& rhs_val) {
+        return lhs_val->is_equal(*rhs_val);
+      }},
+    lhs_val,
+    rhs_val);
+}
+
+namespace detail {
+// Wraps a host UDF as a HOST_UDF aggregation; a null pointer is rejected via CUDF_EXPECTS.
+host_udf_aggregation::host_udf_aggregation(std::unique_ptr<host_udf_base> udf_ptr_)
+  : aggregation{HOST_UDF}, udf_ptr{std::move(udf_ptr_)}
+{
+  CUDF_EXPECTS(udf_ptr != nullptr, "Invalid host_udf_base instance.");
+}
+
+host_udf_aggregation::~host_udf_aggregation() = default;
+// Equal when the base aggregation check passes and the wrapped UDFs report is_equal().
+bool host_udf_aggregation::is_equal(aggregation const& _other) const
+{
+  if (!this->aggregation::is_equal(_other)) { return false; }
+  auto const& other = dynamic_cast<host_udf_aggregation const&>(_other);
+  return udf_ptr->is_equal(*other.udf_ptr);
+}
+// Hash of the base aggregation XOR-ed with the wrapped UDF's own do_hash().
+size_t host_udf_aggregation::do_hash() const
+{
+  return this->aggregation::do_hash() ^ udf_ptr->do_hash();
+}
+// Returns a new host_udf_aggregation wrapping the result of the UDF's clone().
+std::unique_ptr<aggregation> host_udf_aggregation::clone() const
+{
+  return std::make_unique<host_udf_aggregation>(udf_ptr->clone());
+}
+
+}  // namespace detail
+// Factory for a HOST_UDF aggregation; instantiated below for the two exported Base types.
+template <typename Base>
+std::unique_ptr<Base> make_host_udf_aggregation(std::unique_ptr<host_udf_base> udf_ptr_)
+{
+  return std::make_unique<detail::host_udf_aggregation>(std::move(udf_ptr_));
+}
+template CUDF_EXPORT std::unique_ptr<aggregation> make_host_udf_aggregation<aggregation>(
+  std::unique_ptr<host_udf_base>);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+  make_host_udf_aggregation<groupby_aggregation>(std::unique_ptr<host_udf_base>);
+
+}  // namespace cudf
diff --git a/cpp/src/hash/murmurhash3_x64_128.cu b/cpp/src/hash/murmurhash3_x64_128.cu
index 43df7f325ac..ccdd097fa9c 100644
--- a/cpp/src/hash/murmurhash3_x64_128.cu
+++ b/cpp/src/hash/murmurhash3_x64_128.cu
@@ -25,6 +25,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/std/array>
+#include <cuda/std/limits>
 #include <thrust/for_each.h>
 
 namespace cudf {
@@ -83,7 +84,8 @@ class murmur_device_row_hasher {
                                           hash_value_type const seed) const noexcept
     {
       if (check_nulls && col.is_null(row_index)) {
-        return {std::numeric_limits<uint64_t>::max(), std::numeric_limits<uint64_t>::max()};
+        return {cuda::std::numeric_limits<uint64_t>::max(),
+                cuda::std::numeric_limits<uint64_t>::max()};
       }
       auto const hasher = MurmurHash3_x64_128<T>{seed[0]};
       return hasher(col.element<T>(row_index));
diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh
index eb002cf9c6f..52f31667ff0 100644
--- a/cpp/src/hash/sha_hash.cuh
+++ b/cpp/src/hash/sha_hash.cuh
@@ -30,6 +30,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/limits>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
 #include <thrust/for_each.h>
@@ -37,7 +38,6 @@
 #include <thrust/iterator/counting_iterator.h>
 
 #include <algorithm>
-#include <limits>
 #include <memory>
 #include <type_traits>
 #include <utility>
@@ -252,7 +252,7 @@ struct HasherDispatcher {
   {
     Element const& key = input_col.element<Element>(row_index);
     if (isnan(key)) {
-      Element nan = std::numeric_limits<Element>::quiet_NaN();
+      Element nan = cuda::std::numeric_limits<Element>::quiet_NaN();
       hasher->process_fixed_width(nan);
     } else if (key == Element{0.0}) {
       hasher->process_fixed_width(Element{0.0});
diff --git a/cpp/src/hash/xxhash_32.cu b/cpp/src/hash/xxhash_32.cu
new file mode 100644
index 00000000000..40503f7f911
--- /dev/null
+++ b/cpp/src/hash/xxhash_32.cu
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/algorithm.cuh>
+#include <cudf/hashing/detail/hashing.hpp>
+#include <cudf/hashing/detail/xxhash_32.cuh>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/std/limits>
+#include <thrust/tabulate.h>
+
+namespace cudf {
+namespace hashing {
+namespace detail {
+
+namespace {
+
+/**
+ * @brief Hashes one table row with XXHash_32, chaining each column's hash as the next seed.
+ *
+ * @tparam Nullate Null-check flag type: a cudf::nullate type or, as in this file, plain `bool`.
+ */
+template <typename Nullate>
+class device_row_hasher {
+ public:
+  device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
+    : _check_nulls(nulls), _table(t), _seed(seed)
+  {
+  }
+
+  __device__ auto operator()(size_type row_index) const noexcept
+  {
+    return cudf::detail::accumulate(
+      _table.begin(),
+      _table.end(),
+      _seed,  // initial value of the accumulation over the table's columns
+      [row_index, nulls = _check_nulls] __device__(auto hash, auto column) {
+        return cudf::type_dispatcher(
+          column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
+      });
+  }
+
+  /**
+   * @brief Hashes one column element; with null checks on, a null maps to the max hash value.
+   */
+  class element_hasher_adapter {
+   public:
+    template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const& col,
+                                          size_type const row_index,
+                                          Nullate const _check_nulls,
+                                          hash_value_type const _seed) const noexcept
+    {
+      if (_check_nulls && col.is_null(row_index)) {
+        return cuda::std::numeric_limits<hash_value_type>::max();  // null sentinel
+      }
+      auto const hasher = XXHash_32<T>{_seed};  // seeded with the hash accumulated so far
+      return hasher(col.element<T>(row_index));
+    }
+
+    template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const&,
+                                          size_type const,
+                                          Nullate const,
+                                          hash_value_type const) const noexcept
+    {
+      CUDF_UNREACHABLE("Unsupported type for XXHash_32");  // T has no element accessor
+    }
+  };
+
+  Nullate const _check_nulls;       ///< Whether elements are tested for null before hashing
+  table_device_view const _table;   ///< Table whose rows are hashed
+  hash_value_type const _seed;      ///< Seed used for the first column of every row
+};
+
+}  // namespace
+
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr)
+{
+  auto output = make_numeric_column(data_type(type_to_id<hash_value_type>()),
+                                    input.num_rows(),         // one hash value per input row
+                                    mask_state::UNALLOCATED,  // result carries no null mask
+                                    stream,
+                                    mr);
+
+  // Nothing to hash when the table has no columns or no rows; return the column as allocated
+  if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }
+
+  bool const nullable   = has_nulls(input);  // becomes the hasher's per-element null-check flag
+  auto const input_view = table_device_view::create(input, stream);
+  auto output_view      = output->mutable_view();
+
+  // Fill output row i with device_row_hasher(i), i.e. the hash of input row i
+  thrust::tabulate(rmm::exec_policy(stream),
+                   output_view.begin<hash_value_type>(),
+                   output_view.end<hash_value_type>(),
+                   device_row_hasher(nullable, *input_view, seed));
+
+  return output;
+}
+
+}  // namespace detail
+
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();  // range macro from cudf/detail/nvtx/ranges.hpp; used only at the public API
+  return detail::xxhash_32(input, seed, stream, mr);  // all work is done by the detail overload
+}
+
+}  // namespace hashing
+}  // namespace cudf
diff --git a/cpp/src/hash/xxhash_64.cu b/cpp/src/hash/xxhash_64.cu
index bdbe13b1ffb..5e74148ceaf 100644
--- a/cpp/src/hash/xxhash_64.cu
+++ b/cpp/src/hash/xxhash_64.cu
@@ -25,6 +25,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/limits>
 #include <thrust/tabulate.h>
 
 namespace cudf {
@@ -72,7 +73,7 @@ class device_row_hasher {
                                           hash_value_type const _seed) const noexcept
     {
       if (_check_nulls && col.is_null(row_index)) {
-        return std::numeric_limits<hash_value_type>::max();
+        return cuda::std::numeric_limits<hash_value_type>::max();
       }
       auto const hasher = XXHash_64<T>{_seed};
       return hasher(col.element<T>(row_index));
diff --git a/cpp/src/io/avro/avro_common.hpp b/cpp/src/io/avro/avro_common.hpp
index 9bf66369d6a..4c05d78292b 100644
--- a/cpp/src/io/avro/avro_common.hpp
+++ b/cpp/src/io/avro/avro_common.hpp
@@ -142,7 +142,7 @@ enum logicaltype_kind_e {
  *
  * @return true if the logical type is supported, false otherwise.
  */
-inline constexpr bool is_supported_logical_type(logicaltype_kind_e logical_kind)
+CUDF_HOST_DEVICE inline constexpr bool is_supported_logical_type(logicaltype_kind_e logical_kind)
 {
   switch (logical_kind) {
     case logicaltype_date: return true;
diff --git a/cpp/src/io/comp/common.hpp b/cpp/src/io/comp/common.hpp
new file mode 100644
index 00000000000..a81ac60e03a
--- /dev/null
+++ b/cpp/src/io/comp/common.hpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+namespace cudf::io::detail {
+
+/**
+ * @brief Padding granularity: a data buffer's size is padded to a multiple of this value.
+ *
+ * Padding is necessary for input/output buffers of several compression/decompression kernels
+ * (inflate_kernel and nvcomp snappy). Such kernels operate on aligned data pointers, so the
+ * buffers need extra room for the pointers to shift along the address space to satisfy their
+ * alignment requirement.
+ *
+ * The root cause of this requirement is not yet fully understood. Padding is a workaround; it
+ * should be replaced by a proper fix once the issue has been investigated further.
+ * See https://github.com/rapidsai/cudf/issues/13605.
+ */
+constexpr std::size_t BUFFER_PADDING_MULTIPLE{8};
+
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp
index 2dda2287e09..26535bed43b 100644
--- a/cpp/src/io/comp/comp.cpp
+++ b/cpp/src/io/comp/comp.cpp
@@ -87,15 +87,14 @@ std::vector<std::uint8_t> compress_snappy(host_span<uint8_t const> src,
   outputs[0] = d_dst;
   outputs.host_to_device_async(stream);
 
-  cudf::detail::hostdevice_vector<cudf::io::compression_result> hd_status(1, stream);
+  cudf::detail::hostdevice_vector<compression_result> hd_status(1, stream);
   hd_status[0] = {};
   hd_status.host_to_device_async(stream);
 
   nvcomp::batched_compress(nvcomp::compression_type::SNAPPY, inputs, outputs, hd_status, stream);
 
   hd_status.device_to_host_sync(stream);
-  CUDF_EXPECTS(hd_status[0].status == cudf::io::compression_status::SUCCESS,
-               "snappy compression failed");
+  CUDF_EXPECTS(hd_status[0].status == compression_status::SUCCESS, "snappy compression failed");
   return cudf::detail::make_std_vector_sync<uint8_t>(d_dst, stream);
 }
 
diff --git a/cpp/src/io/comp/comp.hpp b/cpp/src/io/comp/comp.hpp
index 652abbbeda6..e16f26e1f06 100644
--- a/cpp/src/io/comp/comp.hpp
+++ b/cpp/src/io/comp/comp.hpp
@@ -16,16 +16,34 @@
 
 #pragma once
 
+#include "common.hpp"
+
 #include <cudf/io/types.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <memory>
-#include <string>
 #include <vector>
 
 namespace CUDF_EXPORT cudf {
 namespace io::detail {
 
+/**
+ * @brief Status of a compression/decompression operation (stored in compression_result::status).
+ */
+enum class compression_status : uint8_t {
+  SUCCESS,          ///< Successful, output is valid
+  FAILURE,          ///< Failed, output is invalid (e.g. input is unsupported in some way)
+  SKIPPED,          ///< Operation skipped (if conversion, uncompressed data can be used)
+  OUTPUT_OVERFLOW,  ///< Output buffer is too small; operation can succeed with larger output
+};
+
+/**
+ * @brief Descriptor of the result of a single compression/decompression operation.
+ */
+struct compression_result {
+  uint64_t bytes_written;     ///< Number of bytes the operation wrote to its output
+  compression_status status;  ///< Outcome of the operation
+};
+
 /**
  * @brief Compresses a system memory buffer.
  *
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 72649dbe427..151f72d262e 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -63,8 +63,8 @@ THE SOFTWARE.
 
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf {
-namespace io {
+namespace cudf::io::detail {
+
 constexpr uint32_t huffman_lookup_table_width      = 8;
 constexpr int8_t brotli_code_length_codes          = 18;
 constexpr uint32_t brotli_num_distance_short_codes = 16;
@@ -2020,7 +2020,6 @@ CUDF_KERNEL void __launch_bounds__(block_size, 2)
     results[block_id].status =
       (s->error == 0) ? compression_status::SUCCESS : compression_status::FAILURE;
     // Return ext heap used by last block (statistics)
-    results[block_id].reserved = s->fb_size;
   }
 }
 
@@ -2115,5 +2114,4 @@ void gpu_debrotli(device_span<device_span<uint8_t const> const> inputs,
 #endif
 }
 
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/comp/gpuinflate.cu b/cpp/src/io/comp/gpuinflate.cu
index 090ea1430b5..6e5ce4ce6c3 100644
--- a/cpp/src/io/comp/gpuinflate.cu
+++ b/cpp/src/io/comp/gpuinflate.cu
@@ -49,8 +49,7 @@ Mark Adler    madler@alumni.caltech.edu
 
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf {
-namespace io {
+namespace cudf::io::detail {
 
 constexpr int max_bits    = 15;   // maximum bits in a code
 constexpr int max_l_codes = 286;  // maximum number of literal/length codes
@@ -1139,7 +1138,6 @@ CUDF_KERNEL void __launch_bounds__(block_size)
         default: return compression_status::FAILURE;
       }
     }();
-    results[z].reserved = (int)(state->end - state->cur);  // Here mainly for debug purposes
   }
 }
 
@@ -1224,5 +1222,4 @@ void gpu_copy_uncompressed_blocks(device_span<device_span<uint8_t const> const>
   }
 }
 
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp
index 8bfca2b30df..4b09bd5a84c 100644
--- a/cpp/src/io/comp/gpuinflate.hpp
+++ b/cpp/src/io/comp/gpuinflate.hpp
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "io/comp/comp.hpp"
+
 #include <cudf/io/types.hpp>
 #include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
@@ -24,44 +26,10 @@
 
 #include <cstdint>
 
-namespace cudf {
-namespace io {
-
-/**
- * @brief Status of a compression/decompression operation.
- */
-enum class compression_status : uint8_t {
-  SUCCESS,          ///< Successful, output is valid
-  FAILURE,          ///< Failed, output is invalid (e.g. input is unsupported in some way)
-  SKIPPED,          ///< Operation skipped (if conversion, uncompressed data can be used)
-  OUTPUT_OVERFLOW,  ///< Output buffer is too small; operation can succeed with larger output
-};
-
-/**
- * @brief Descriptor of compression/decompression result.
- */
-struct compression_result {
-  uint64_t bytes_written;
-  compression_status status;
-  uint32_t reserved;
-};
+namespace cudf::io::detail {
 
 enum class gzip_header_included { NO, YES };
 
-/**
- * @brief The value used for padding a data buffer such that its size will be multiple of it.
- *
- * Padding is necessary for input/output buffers of several compression/decompression kernels
- * (inflate_kernel and nvcomp snappy). Such kernels operate on aligned data pointers, which require
- * padding to the buffers so that the pointers can shift along the address space to satisfy their
- * alignment requirement.
- *
- * In the meantime, it is not entirely clear why such padding is needed. We need to further
- * investigate and implement a better fix rather than just padding the buffer.
- * See https://github.com/rapidsai/cudf/issues/13605.
- */
-constexpr std::size_t BUFFER_PADDING_MULTIPLE{8};
-
 /**
  * @brief Interface for decompressing GZIP-compressed data
  *
@@ -169,5 +137,4 @@ void gpu_snap(device_span<device_span<uint8_t const> const> inputs,
   device_span<compression_result const> results,
   rmm::cuda_stream_view stream);
 
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/comp/io_uncomp.hpp b/cpp/src/io/comp/io_uncomp.hpp
index ca722a9b7ee..711a1c3274f 100644
--- a/cpp/src/io/comp/io_uncomp.hpp
+++ b/cpp/src/io/comp/io_uncomp.hpp
@@ -16,15 +16,13 @@
 
 #pragma once
 
+#include "common.hpp"
+
 #include <cudf/io/types.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <memory>
-#include <string>
 #include <vector>
 
-using cudf::host_span;
-
 namespace CUDF_EXPORT cudf {
 namespace io::detail {
 
diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
index 9d3cf75a13f..ac81dd421fa 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cpp
+++ b/cpp/src/io/comp/nvcomp_adapter.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,8 +18,8 @@
 
 #include "nvcomp_adapter.cuh"
 
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/io/config_utils.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <nvcomp/deflate.h>
@@ -30,7 +30,7 @@
 
 #include <mutex>
 
-namespace cudf::io::nvcomp {
+namespace cudf::io::detail::nvcomp {
 namespace {
 
 // Dispatcher for nvcompBatched<format>DecompressGetTempSizeEx
@@ -416,11 +416,11 @@ std::optional<std::string> is_compression_disabled(compression_type compression,
   memo_map_lock.unlock();
 
   if (reason.has_value()) {
-    CUDF_LOG_INFO("nvCOMP is disabled for {} compression; reason: {}",
+    CUDF_LOG_INFO("nvCOMP is disabled for %s compression; reason: %s",
                   compression_type_name(compression),
                   reason.value());
   } else {
-    CUDF_LOG_INFO("nvCOMP is enabled for {} compression", compression_type_name(compression));
+    CUDF_LOG_INFO("nvCOMP is enabled for %s compression", compression_type_name(compression));
   }
 
   return reason;
@@ -445,11 +445,11 @@ std::optional<std::string> is_decompression_disabled(compression_type compressio
   memo_map_lock.unlock();
 
   if (reason.has_value()) {
-    CUDF_LOG_INFO("nvCOMP is disabled for {} decompression; reason: {}",
+    CUDF_LOG_INFO("nvCOMP is disabled for %s decompression; reason: %s",
                   compression_type_name(compression),
                   reason.value());
   } else {
-    CUDF_LOG_INFO("nvCOMP is enabled for {} decompression", compression_type_name(compression));
+    CUDF_LOG_INFO("nvCOMP is enabled for %s decompression", compression_type_name(compression));
   }
 
   return reason;
@@ -478,4 +478,4 @@ std::optional<size_t> compress_max_allowed_chunk_size(compression_type compressi
   }
 }
 
-}  // namespace cudf::io::nvcomp
+}  // namespace cudf::io::detail::nvcomp
diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu
index 794d452ebf2..cf5996dfd93 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cu
+++ b/cpp/src/io/comp/nvcomp_adapter.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
-namespace cudf::io::nvcomp {
+namespace cudf::io::detail::nvcomp {
 
 batched_args create_batched_nvcomp_args(device_span<device_span<uint8_t const> const> inputs,
                                         device_span<device_span<uint8_t> const> outputs,
@@ -127,4 +127,4 @@ std::pair<size_t, size_t> max_chunk_and_total_input_size(device_span<size_t cons
   return {max, sum};
 }
 
-}  // namespace cudf::io::nvcomp
+}  // namespace cudf::io::detail::nvcomp
diff --git a/cpp/src/io/comp/nvcomp_adapter.cuh b/cpp/src/io/comp/nvcomp_adapter.cuh
index 4a7b6463fa0..1b303d88915 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cuh
+++ b/cpp/src/io/comp/nvcomp_adapter.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "gpuinflate.hpp"
+#include "comp.hpp"
 
 #include <cudf/utilities/span.hpp>
 
@@ -27,7 +27,7 @@
 
 #include <optional>
 
-namespace cudf::io::nvcomp {
+namespace cudf::io::detail::nvcomp {
 
 struct batched_args {
   rmm::device_uvector<void const*> input_data_ptrs;
@@ -76,4 +76,4 @@ void skip_unsupported_inputs(device_span<size_t> input_sizes,
 std::pair<size_t, size_t> max_chunk_and_total_input_size(device_span<size_t const> input_sizes,
                                                          rmm::cuda_stream_view stream);
 
-}  // namespace cudf::io::nvcomp
+}  // namespace cudf::io::detail::nvcomp
diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp
index 2e1cda2d6b7..5c402523168 100644
--- a/cpp/src/io/comp/nvcomp_adapter.hpp
+++ b/cpp/src/io/comp/nvcomp_adapter.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "gpuinflate.hpp"
+#include "io/comp/comp.hpp"
 
 #include <cudf/io/nvcomp_adapter.hpp>
 #include <cudf/utilities/span.hpp>
@@ -25,7 +25,7 @@
 
 #include <optional>
 
-namespace cudf::io::nvcomp {
+namespace cudf::io::detail::nvcomp {
 /**
  * @brief Device batch decompression of given type.
  *
@@ -103,4 +103,4 @@ void batched_compress(compression_type compression,
                       device_span<compression_result> results,
                       rmm::cuda_stream_view stream);
 
-}  // namespace cudf::io::nvcomp
+}  // namespace cudf::io::detail::nvcomp
diff --git a/cpp/src/io/comp/snap.cu b/cpp/src/io/comp/snap.cu
index 7d4dcffa713..1443bfd38a2 100644
--- a/cpp/src/io/comp/snap.cu
+++ b/cpp/src/io/comp/snap.cu
@@ -19,8 +19,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf {
-namespace io {
+namespace cudf::io::detail {
 constexpr int hash_bits = 12;
 
 // TBD: Tentatively limits to 2-byte codes to prevent long copy search followed by long literal
@@ -329,7 +328,6 @@ CUDF_KERNEL void __launch_bounds__(128)
     results[blockIdx.x].bytes_written = s->dst - s->dst_base;
     results[blockIdx.x].status =
       (s->dst > s->end) ? compression_status::FAILURE : compression_status::SUCCESS;
-    results[blockIdx.x].reserved = 0;
   }
 }
 
@@ -345,5 +343,4 @@ void gpu_snap(device_span<device_span<uint8_t const> const> inputs,
   }
 }
 
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/statistics.cu
index faf967041bc..caee9145d2c 100644
--- a/cpp/src/io/comp/statistics.cu
+++ b/cpp/src/io/comp/statistics.cu
@@ -21,7 +21,7 @@
 #include <cuda/functional>
 #include <thrust/transform_reduce.h>
 
-namespace cudf::io {
+namespace cudf::io::detail {
 
 writer_compression_statistics collect_compression_statistics(
   device_span<device_span<uint8_t const> const> inputs,
@@ -61,4 +61,4 @@ writer_compression_statistics collect_compression_statistics(
                                        output_size_successful};
 }
 
-}  // namespace cudf::io
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/comp/unsnap.cu b/cpp/src/io/comp/unsnap.cu
index b48e49ffd78..cf841c435a3 100644
--- a/cpp/src/io/comp/unsnap.cu
+++ b/cpp/src/io/comp/unsnap.cu
@@ -21,12 +21,10 @@
 
 #include <cub/cub.cuh>
 
-namespace cudf {
-namespace io {
+namespace cudf::io::detail {
 constexpr int32_t batch_size    = (1 << 5);
 constexpr int32_t batch_count   = (1 << 2);
 constexpr int32_t prefetch_size = (1 << 9);  // 512B, in 32B chunks
-constexpr bool log_cyclecount   = false;
 
 void __device__ busy_wait(size_t cycles)
 {
@@ -65,7 +63,8 @@ struct unsnap_queue_s {
  * @brief snappy decompression state
  */
 struct unsnap_state_s {
-  constexpr unsnap_state_s() noexcept {}  // required to compile on ctk-12.2 + aarch64
+  CUDF_HOST_DEVICE constexpr unsnap_state_s() noexcept {
+  }  // required to compile on ctk-12.2 + aarch64
 
   uint8_t const* base{};           ///< base ptr of compressed stream
   uint8_t const* end{};            ///< end of compressed stream
@@ -646,7 +645,6 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     auto cur       = s->src.begin();
     auto const end = s->src.end();
     s->error       = 0;
-    if (log_cyclecount) { s->tstart = clock(); }
     if (cur < end) {
       // Read uncompressed size (varint), limited to 32-bit
       uint32_t uncompressed_size = *cur++;
@@ -704,11 +702,6 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     results[strm_id].bytes_written = s->uncompressed_size - s->bytes_left;
     results[strm_id].status =
       (s->error == 0) ? compression_status::SUCCESS : compression_status::FAILURE;
-    if (log_cyclecount) {
-      results[strm_id].reserved = clock() - s->tstart;
-    } else {
-      results[strm_id].reserved = 0;
-    }
   }
 }
 
@@ -723,5 +716,4 @@ void gpu_unsnap(device_span<device_span<uint8_t const> const> inputs,
   unsnap_kernel<128><<<dim_grid, dim_block, 0, stream.value()>>>(inputs, outputs, results);
 }
 
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 6c84b53db46..0d51526d925 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,20 +21,20 @@
 
 #include "csv_common.hpp"
 #include "csv_gpu.hpp"
-#include "cudf/detail/utilities/cuda_memcpy.hpp"
 #include "io/comp/io_uncomp.hpp"
 #include "io/utilities/column_buffer.hpp"
 #include "io/utilities/hostdevice_vector.hpp"
 #include "io/utilities/parsing_utils.cuh"
 
 #include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/io/csv.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/csv.hpp>
 #include <cudf/io/types.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/strings/detail/replace.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
@@ -771,7 +771,7 @@ table_with_metadata read_csv(cudf::io::datasource* source,
     if (!reader_opts.is_enabled_mangle_dupe_cols()) {
       for (auto& col_name : column_names) {
         if (++col_names_counts[col_name] > 1) {
-          CUDF_LOG_WARN("Multiple columns with name {}; only the first appearance is parsed",
+          CUDF_LOG_WARN("Multiple columns with name %s; only the first appearance is parsed",
                         col_name);
 
           auto const idx    = &col_name - column_names.data();
diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh
index 0e70984b39c..5685b50c322 100644
--- a/cpp/src/io/fst/agent_dfa.cuh
+++ b/cpp/src/io/fst/agent_dfa.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include "in_reg_array.cuh"
 
 #include <cub/cub.cuh>
+#include <cuda/std/array>
 #include <cuda/std/type_traits>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/discard_iterator.h>
@@ -307,12 +308,14 @@ class WriteCoalescingCallbackWrapper {
   {
     __syncthreads();
     if constexpr (!DiscardTranslatedOutput) {
-      for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) {
+      for (thread_index_type out_char = threadIdx.x; out_char < tile_out_count;
+           out_char += blockDim.x) {
         out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char];
       }
     }
     if constexpr (!DiscardIndexOutput) {
-      for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) {
+      for (thread_index_type out_char = threadIdx.x; out_char < tile_out_count;
+           out_char += blockDim.x) {
         out_idx_it[tile_out_offset + out_char] =
           temp_storage.compacted_offset[out_char] + tile_in_offset;
       }
@@ -342,8 +345,9 @@ class WriteCoalescingCallbackWrapper {
 template <int32_t NUM_INSTANCES, typename TransitionTableT>
 class StateVectorTransitionOp {
  public:
-  __device__ __forceinline__ StateVectorTransitionOp(
-    TransitionTableT const& transition_table, std::array<StateIndexT, NUM_INSTANCES>& state_vector)
+  __device__ __forceinline__
+  StateVectorTransitionOp(TransitionTableT const& transition_table,
+                          cuda::std::array<StateIndexT, NUM_INSTANCES>& state_vector)
     : transition_table(transition_table), state_vector(state_vector)
   {
   }
@@ -360,7 +364,7 @@ class StateVectorTransitionOp {
   }
 
  public:
-  std::array<StateIndexT, NUM_INSTANCES>& state_vector;
+  cuda::std::array<StateIndexT, NUM_INSTANCES>& state_vector;
   TransitionTableT const& transition_table;
 };
 
@@ -620,7 +624,7 @@ struct AgentDFA {
     SymbolItT d_chars,
     OffsetT const block_offset,
     OffsetT const num_total_symbols,
-    std::array<StateIndexT, NUM_STATES>& state_vector)
+    cuda::std::array<StateIndexT, NUM_STATES>& state_vector)
   {
     using StateVectorTransitionOpT = StateVectorTransitionOp<NUM_STATES, TransitionTableT>;
 
@@ -796,10 +800,10 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
   // Stage 1: Compute the state-transition vector
   if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) {
     // Keeping track of the state for each of the <NUM_STATES> state machines
-    std::array<StateIndexT, NUM_STATES> state_vector;
+    cuda::std::array<StateIndexT, NUM_STATES> state_vector;
 
     // Initialize the seed state transition vector with the identity vector
-    thrust::sequence(thrust::seq, std::begin(state_vector), std::end(state_vector));
+    thrust::sequence(thrust::seq, cuda::std::begin(state_vector), cuda::std::end(state_vector));
 
     // Compute the state transition vector
     agent_dfa.GetThreadStateTransitionVector<NUM_STATES>(symbol_matcher,
diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh
index 0f1fc7d572b..98641f2c893 100644
--- a/cpp/src/io/fst/logical_stack.cuh
+++ b/cpp/src/io/fst/logical_stack.cuh
@@ -513,6 +513,12 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols,
       stream));
   }
 
+  // Without reset ops, the last operation's stack level must be 0, i.e. the stack ends up empty.
+  if (num_symbols_in && !supports_reset_op) {
+    StackOpT last_symbol = d_kv_ops_current.element(num_symbols_in - 1, stream);
+    CUDF_EXPECTS(last_symbol.stack_level == 0, "The logical stack is not empty!");
+  }
+
   // Stable radix sort, sorting by stack level of the operations
   d_kv_operations_unsigned = cub::DoubleBuffer<StackOpUnsignedT>{
     reinterpret_cast<StackOpUnsignedT*>(d_kv_operations.Current()),
diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu
index 7fafa885c66..7b9fc25d1cc 100644
--- a/cpp/src/io/json/host_tree_algorithms.cu
+++ b/cpp/src/io/json/host_tree_algorithms.cu
@@ -222,18 +222,19 @@ struct json_column_data {
 using hashmap_of_device_columns =
   std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>>;
 
-std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
-  device_json_column& root,
-  host_span<uint8_t const> is_str_column_all_nulls,
-  tree_meta_t& d_column_tree,
-  device_span<NodeIndexT const> d_unique_col_ids,
-  device_span<size_type const> d_max_row_offsets,
-  std::vector<std::string> const& column_names,
-  NodeIndexT row_array_parent_col_id,
-  bool is_array_of_arrays,
-  cudf::io::json_reader_options const& options,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
+std::
+  tuple<cudf::detail::host_vector<bool>, cudf::detail::host_vector<bool>, hashmap_of_device_columns>
+  build_tree(device_json_column& root,
+             host_span<uint8_t const> is_str_column_all_nulls,
+             tree_meta_t& d_column_tree,
+             device_span<NodeIndexT const> d_unique_col_ids,
+             device_span<size_type const> d_max_row_offsets,
+             std::vector<std::string> const& column_names,
+             NodeIndexT row_array_parent_col_id,
+             bool is_array_of_arrays,
+             cudf::io::json_reader_options const& options,
+             rmm::cuda_stream_view stream,
+             rmm::device_async_resource_ref mr);
 
 void scatter_offsets(tree_meta_t const& tree,
                      device_span<NodeIndexT const> col_ids,
@@ -242,6 +243,7 @@ void scatter_offsets(tree_meta_t const& tree,
                      device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
                      tree_meta_t const& d_column_tree,
                      host_span<const bool> ignore_vals,
+                     host_span<const bool> is_mixed,
                      hashmap_of_device_columns const& columns,
                      rmm::cuda_stream_view stream);
 
@@ -363,17 +365,17 @@ void make_device_json_column(device_span<SymbolT const> input,
     }
     return std::vector<uint8_t>();
   }();
-  auto const [ignore_vals, columns] = build_tree(root,
-                                                 is_str_column_all_nulls,
-                                                 d_column_tree,
-                                                 d_unique_col_ids,
-                                                 d_max_row_offsets,
-                                                 column_names,
-                                                 row_array_parent_col_id,
-                                                 is_array_of_arrays,
-                                                 options,
-                                                 stream,
-                                                 mr);
+  auto const [ignore_vals, is_mixed_pruned, columns] = build_tree(root,
+                                                                  is_str_column_all_nulls,
+                                                                  d_column_tree,
+                                                                  d_unique_col_ids,
+                                                                  d_max_row_offsets,
+                                                                  column_names,
+                                                                  row_array_parent_col_id,
+                                                                  is_array_of_arrays,
+                                                                  options,
+                                                                  stream,
+                                                                  mr);
   if (ignore_vals.empty()) return;
   scatter_offsets(tree,
                   col_ids,
@@ -382,22 +384,24 @@ void make_device_json_column(device_span<SymbolT const> input,
                   sorted_col_ids,
                   d_column_tree,
                   ignore_vals,
+                  is_mixed_pruned,
                   columns,
                   stream);
 }
 
-std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree(
-  device_json_column& root,
-  host_span<uint8_t const> is_str_column_all_nulls,
-  tree_meta_t& d_column_tree,
-  device_span<NodeIndexT const> d_unique_col_ids,
-  device_span<size_type const> d_max_row_offsets,
-  std::vector<std::string> const& column_names,
-  NodeIndexT row_array_parent_col_id,
-  bool is_array_of_arrays,
-  cudf::io::json_reader_options const& options,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
+std::
+  tuple<cudf::detail::host_vector<bool>, cudf::detail::host_vector<bool>, hashmap_of_device_columns>
+  build_tree(device_json_column& root,
+             host_span<uint8_t const> is_str_column_all_nulls,
+             tree_meta_t& d_column_tree,
+             device_span<NodeIndexT const> d_unique_col_ids,
+             device_span<size_type const> d_max_row_offsets,
+             std::vector<std::string> const& column_names,
+             NodeIndexT row_array_parent_col_id,
+             bool is_array_of_arrays,
+             cudf::io::json_reader_options const& options,
+             rmm::cuda_stream_view stream,
+             rmm::device_async_resource_ref mr)
 {
   bool const is_enabled_lines                 = options.is_enabled_lines();
   bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string();
@@ -488,7 +492,9 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
   // NoPruning: iterate through schema and enforce type.
 
   if (adj[parent_node_sentinel].empty())
-    return {cudf::detail::make_host_vector<bool>(0, stream), {}};  // for empty file
+    return {cudf::detail::make_host_vector<bool>(0, stream),
+            cudf::detail::make_host_vector<bool>(0, stream),
+            {}};  // for empty file
   CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1");
   auto expected_types = cudf::detail::make_host_vector<NodeT>(num_columns, stream);
   std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES);
@@ -551,11 +557,14 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
       auto list_child = schema.child_types.at(this_list_child_name);
       for (auto const& child_id : child_ids)
         mark_is_pruned(child_id, list_child);
+      // TODO: Store null map of non-target types for list children to mark list entry as null.
     }
   };
   if (is_array_of_arrays) {
     if (adj[adj[parent_node_sentinel][0]].empty())
-      return {cudf::detail::make_host_vector<bool>(0, stream), {}};
+      return {cudf::detail::make_host_vector<bool>(0, stream),
+              cudf::detail::make_host_vector<bool>(0, stream),
+              {}};
     auto root_list_col_id =
       is_enabled_lines ? adj[parent_node_sentinel][0] : adj[adj[parent_node_sentinel][0]][0];
     // mark root and row array col_id as not pruned.
@@ -647,8 +656,12 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
       ? adj[parent_node_sentinel][0]
       : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]);
 
+  // Flags list children that are pruned mixed types; their parent list rows are nullified.
+  auto is_mixed_pruned = cudf::detail::make_host_vector<bool>(num_columns, stream);
+  std::fill_n(is_mixed_pruned.begin(), num_columns, false);
   auto handle_mixed_types = [&column_categories,
                              &is_str_column_all_nulls,
+                             &is_mixed_pruned,
                              &is_pruned,
                              &expected_types,
                              &is_enabled_mixed_types_as_string,
@@ -794,6 +807,14 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
                      "list child column insertion failed, duplicate column name in the parent");
         ref.get().column_order.emplace_back(list_child_name);
         auto this_ref = std::ref(ref.get().child_columns.at(list_child_name));
+        if (options.is_enabled_experimental()) {
+          for (auto const& child_id : child_ids) {
+            if (is_pruned[child_id]) {
+              // store this child_id for mixed_type nullify parent list_id.
+              is_mixed_pruned[child_id] = is_pruned[child_id];
+            }
+          }
+        }
         // Mixed type handling
         handle_mixed_types(child_ids);
         if (child_ids.empty()) {
@@ -829,7 +850,7 @@ std::pair<cudf::detail::host_vector<bool>, hashmap_of_device_columns> build_tree
                  [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; });
   cudf::detail::cuda_memcpy_async<NodeT>(d_column_tree.node_categories, expected_types, stream);
 
-  return {is_pruned, columns};
+  return {is_pruned, is_mixed_pruned, columns};
 }
 
 void scatter_offsets(tree_meta_t const& tree,
@@ -839,6 +860,7 @@ void scatter_offsets(tree_meta_t const& tree,
                      device_span<size_type> sorted_col_ids,  // Reuse this for parent_col_ids
                      tree_meta_t const& d_column_tree,
                      host_span<const bool> ignore_vals,
+                     host_span<const bool> is_mixed_pruned,
                      hashmap_of_device_columns const& columns,
                      rmm::cuda_stream_view stream)
 {
@@ -857,6 +879,8 @@ void scatter_offsets(tree_meta_t const& tree,
 
   auto d_ignore_vals = cudf::detail::make_device_uvector_async(
     ignore_vals, stream, cudf::get_current_device_resource_ref());
+  auto d_is_mixed_pruned = cudf::detail::make_device_uvector_async(
+    is_mixed_pruned, stream, cudf::get_current_device_resource_ref());
   auto d_columns_data = cudf::detail::make_device_uvector_async(
     columns_data, stream, cudf::get_current_device_resource_ref());
 
@@ -921,9 +945,31 @@ void scatter_offsets(tree_meta_t const& tree,
              column_categories[col_ids[parent_node_id]] == NC_LIST and
              (!d_ignore_vals[col_ids[parent_node_id]]);
     });
+  // For list children whose column is marked in is_mixed_pruned, find the parent node id and clear
+  // the parent list column's validity bit at that row, i.e. set mixed type list rows to null.
+  auto const num_list_children = thrust::distance(
+    thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()), list_children_end);
+  thrust::for_each_n(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_counting_iterator<size_type>(0),
+    num_list_children,
+    [node_ids          = node_ids.begin(),
+     parent_node_ids   = tree.parent_node_ids.begin(),
+     column_categories = d_column_tree.node_categories.begin(),
+     col_ids           = col_ids.begin(),
+     row_offsets       = row_offsets.begin(),
+     d_is_mixed_pruned = d_is_mixed_pruned.begin(),
+     d_ignore_vals     = d_ignore_vals.begin(),
+     d_columns_data    = d_columns_data.begin()] __device__(size_type i) {
+      auto const node_id        = node_ids[i];
+      auto const parent_node_id = parent_node_ids[node_id];
+      if (parent_node_id == parent_node_sentinel or d_ignore_vals[col_ids[parent_node_id]]) return;
+      if (column_categories[col_ids[parent_node_id]] == NC_LIST and
+          d_is_mixed_pruned[col_ids[node_id]]) {
+        clear_bit(d_columns_data[col_ids[parent_node_id]].validity, row_offsets[parent_node_id]);
+      }
+    });
 
-  auto const num_list_children =
-    list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin());
   thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream),
                              parent_col_ids.begin(),
                              parent_col_ids.begin() + num_list_children,
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 30a154fdda2..1fe58a0449f 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -464,46 +464,49 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
       column_names.emplace_back(
         json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first);
 
-      // Note: json_col modified here, reuse the memory
+      // If child is not present, set the null mask correctly, but offsets are zero, and children
+      // are empty. Note: json_col modified here, reuse the memory
       auto offsets_column = std::make_unique<column>(data_type{type_id::INT32},
                                                      num_rows + 1,
                                                      json_col.child_offsets.release(),
                                                      rmm::device_buffer{},
                                                      0);
       // Create children column
-      auto child_schema_element =
-        json_col.child_columns.empty() ? std::optional<schema_element>{} : get_list_child_schema();
-      auto [child_column, names] =
-        json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value())
-          ? std::pair<std::unique_ptr<column>,
-                      // EMPTY type could not used because gather throws exception on EMPTY type.
-                      std::vector<column_name_info>>{std::make_unique<column>(
-                                                       data_type{type_id::INT8},
-                                                       0,
-                                                       rmm::device_buffer{},
-                                                       rmm::device_buffer{},
-                                                       0),
-                                                     std::vector<column_name_info>{}}
-          : device_json_column_to_cudf_column(json_col.child_columns.begin()->second,
-                                              d_input,
-                                              options,
-                                              prune_columns,
-                                              child_schema_element,
-                                              stream,
-                                              mr);
+      auto child_schema_element  = get_list_child_schema();
+      auto [child_column, names] = [&]() {
+        if (json_col.child_columns.empty()) {
+          // EMPTY type cannot be used because gather throws an exception on EMPTY type.
+          auto empty_col = make_empty_column(
+            child_schema_element.value_or(schema_element{data_type{type_id::INT8}}), stream, mr);
+          auto children_metadata = std::vector<column_name_info>{
+            make_column_name_info(
+              child_schema_element.value_or(schema_element{data_type{type_id::INT8}}),
+              list_child_name)
+              .children};
+
+          return std::pair<std::unique_ptr<column>, std::vector<column_name_info>>{
+            std::move(empty_col), children_metadata};
+        }
+        return device_json_column_to_cudf_column(json_col.child_columns.begin()->second,
+                                                 d_input,
+                                                 options,
+                                                 prune_columns,
+                                                 child_schema_element,
+                                                 stream,
+                                                 mr);
+      }();
       column_names.back().children      = names;
       auto [result_bitmask, null_count] = make_validity(json_col);
-      auto ret_col                      = make_lists_column(num_rows,
-                                       std::move(offsets_column),
-                                       std::move(child_column),
-                                       0,
-                                       rmm::device_buffer{0, stream, mr},
-                                       stream,
-                                       mr);
-      // The null_mask is set after creation of list column is to skip the purge_nonempty_nulls and
-      // null validation applied in make_lists_column factory, which is not needed for json
-      // parent column cannot be null when its children is non-empty in JSON
-      if (null_count != 0) { ret_col->set_null_mask(std::move(result_bitmask), null_count); }
+      auto ret_col                      = make_lists_column(
+        num_rows,
+        std::move(offsets_column),
+        std::move(child_column),
+        null_count,
+        null_count == 0 ? rmm::device_buffer{0, stream, mr} : std::move(result_bitmask),
+        stream,
+        mr);
+      // Since some rows in the child column may need to be nullified due to mixed types, we cannot
+      // skip the purge_nonempty_nulls call in the make_lists_column factory.
       return {std::move(ret_col), std::move(column_names)};
     }
     default: CUDF_FAIL("Unsupported column type"); break;
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 4989fff4b30..cc5f256ea80 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -429,6 +429,18 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
                                              rmm::cuda_stream_view stream,
                                              rmm::device_async_resource_ref mr);
 
+/**
+ * @brief Create empty column of a given nested schema
+ *
+ * @param schema The schema of the column to create
+ * @param stream The CUDA stream to which kernels are dispatched
+ * @param mr resource with which to allocate
+ * @return The empty column
+ */
+std::unique_ptr<column> make_empty_column(schema_element const& schema,
+                                          rmm::cuda_stream_view stream,
+                                          rmm::device_async_resource_ref mr);
+
 /**
  * @brief Create all null column of a given nested schema
  *
@@ -452,17 +464,6 @@ std::unique_ptr<column> make_all_nulls_column(schema_element const& schema,
  */
 column_name_info make_column_name_info(schema_element const& schema, std::string const& col_name);
 
-/**
- * @brief Get the path data type of a column by path if present in input schema
- *
- * @param path path of the column
- * @param options json reader options which holds schema
- * @return data type of the column if present
- */
-std::optional<data_type> get_path_data_type(
-  host_span<std::pair<std::string, cudf::io::json::NodeT> const> path,
-  cudf::io::json_reader_options const& options);
-
 /**
  * @brief Helper class to get path of a column by column id from reduced column tree
  *
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index f1c2826c62a..30a28a1cf98 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -1473,10 +1473,11 @@ void get_stack_context(device_span<SymbolT const> json_in,
                                   to_stack_op::start_state,
                                   stream);
 
-  auto stack_ops_bufsize = d_num_stack_ops.value(stream);
+  // Copy back the actual number of stack operations
+  auto num_stack_ops = d_num_stack_ops.value(stream);
   // Sequence of stack symbols and their position in the original input (sparse representation)
-  rmm::device_uvector<StackSymbolT> stack_ops{stack_ops_bufsize, stream};
-  rmm::device_uvector<SymbolOffsetT> stack_op_indices{stack_ops_bufsize, stream};
+  rmm::device_uvector<StackSymbolT> stack_ops{num_stack_ops, stream};
+  rmm::device_uvector<SymbolOffsetT> stack_op_indices{num_stack_ops, stream};
 
   // Run bracket-brace FST to retrieve starting positions of structs and lists
   json_to_stack_ops_fst.Transduce(json_in.begin(),
@@ -1487,9 +1488,6 @@ void get_stack_context(device_span<SymbolT const> json_in,
                                   to_stack_op::start_state,
                                   stream);
 
-  // Copy back to actual number of stack operations
-  auto const num_stack_ops = d_num_stack_ops.value(stream);
-
   // Stack operations with indices are converted to top of the stack for each character in the input
   if (stack_behavior == stack_behavior_t::ResetOnDelimiter) {
     fst::sparse_stack_op_to_top_of_stack<fst::stack_op_support::WITH_RESET_SUPPORT, StackLevelT>(
diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp
index ced7acb9cde..4b4827ca8d9 100644
--- a/cpp/src/io/json/parser_features.cpp
+++ b/cpp/src/io/json/parser_features.cpp
@@ -68,78 +68,6 @@ void json_reader_options::set_dtypes(schema_element types)
 }  // namespace cudf::io
 
 namespace cudf::io::json::detail {
-namespace {
-
-// example schema and its path.
-// "a": int             {"a", int}
-// "a": [ int ]         {"a", list}, {"element", int}
-// "a": { "b": int}     {"a", struct}, {"b", int}
-// "a": [ {"b": int }]  {"a", list}, {"element", struct}, {"b", int}
-// "a": [ null]         {"a", list}, {"element", str}
-// back() is root.
-// front() is leaf.
-/**
- * @brief Get the path data type of a column by path if present in input schema
- *
- * @param path path of the json column
- * @param root root of input schema element
- * @return data type of the column if present, otherwise std::nullopt
- */
-std::optional<data_type> get_path_data_type(
-  host_span<std::pair<std::string, cudf::io::json::NodeT> const> path, schema_element const& root)
-{
-  if (path.empty() || path.size() == 1) {
-    return root.type;
-  } else {
-    if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) {
-      auto const child_name      = path.first(path.size() - 1).back().first;
-      auto const child_schema_it = root.child_types.find(child_name);
-      return (child_schema_it != std::end(root.child_types))
-               ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second)
-               : std::optional<data_type>{};
-    } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) {
-      auto const child_schema_it = root.child_types.find(list_child_name);
-      return (child_schema_it != std::end(root.child_types))
-               ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second)
-               : std::optional<data_type>{};
-    }
-    return std::optional<data_type>{};
-  }
-}
-
-std::optional<schema_element> child_schema_element(std::string const& col_name,
-                                                   cudf::io::json_reader_options const& options)
-{
-  return std::visit(
-    cudf::detail::visitor_overload{
-      [col_name](std::vector<data_type> const& user_dtypes) -> std::optional<schema_element> {
-        auto column_index = atol(col_name.data());
-        return (static_cast<std::size_t>(column_index) < user_dtypes.size())
-                 ? std::optional<schema_element>{{user_dtypes[column_index]}}
-                 : std::optional<schema_element>{};
-      },
-      [col_name](
-        std::map<std::string, data_type> const& user_dtypes) -> std::optional<schema_element> {
-        return (user_dtypes.find(col_name) != std::end(user_dtypes))
-                 ? std::optional<schema_element>{{user_dtypes.find(col_name)->second}}
-                 : std::optional<schema_element>{};
-      },
-      [col_name](
-        std::map<std::string, schema_element> const& user_dtypes) -> std::optional<schema_element> {
-        return (user_dtypes.find(col_name) != std::end(user_dtypes))
-                 ? user_dtypes.find(col_name)->second
-                 : std::optional<schema_element>{};
-      },
-      [col_name](schema_element const& user_dtypes) -> std::optional<schema_element> {
-        return (user_dtypes.child_types.find(col_name) != std::end(user_dtypes.child_types))
-                 ? user_dtypes.child_types.find(col_name)->second
-                 : std::optional<schema_element>{};
-      }},
-    options.get_dtypes());
-}
-
-}  // namespace
-
 /// Created an empty column of the specified schema
 struct empty_column_functor {
   rmm::cuda_stream_view stream;
@@ -159,7 +87,17 @@ struct empty_column_functor {
     std::unique_ptr<column> child = cudf::type_dispatcher(
       schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name));
     auto offsets = make_empty_column(data_type(type_to_id<size_type>()));
-    return make_lists_column(0, std::move(offsets), std::move(child), 0, {}, stream, mr);
+    std::vector<std::unique_ptr<column>> child_columns;
+    child_columns.push_back(std::move(offsets));
+    child_columns.push_back(std::move(child));
+    // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` on
+    // the child column as it does not have non-empty nulls. See issue #17356.
+    return std::make_unique<column>(cudf::data_type{type_id::LIST},
+                                    0,
+                                    rmm::device_buffer{},
+                                    rmm::device_buffer{},
+                                    0,
+                                    std::move(child_columns));
   }
 
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, cudf::struct_view>)>
@@ -174,6 +112,13 @@ struct empty_column_functor {
   }
 };
 
+std::unique_ptr<column> make_empty_column(schema_element const& schema,
+                                          rmm::cuda_stream_view stream,
+                                          rmm::device_async_resource_ref mr)
+{
+  return cudf::type_dispatcher(schema.type, empty_column_functor{stream, mr}, schema);
+}
+
 /// Created all null column of the specified schema
 struct allnull_column_functor {
   rmm::cuda_stream_view stream;
@@ -198,10 +143,9 @@ struct allnull_column_functor {
   std::unique_ptr<column> operator()(schema_element const& schema, size_type size) const
   {
     CUDF_EXPECTS(schema.child_types.size() == 1, "Dictionary column should have only one child");
-    auto const& child_name        = schema.child_types.begin()->first;
-    std::unique_ptr<column> child = cudf::type_dispatcher(schema.child_types.at(child_name).type,
-                                                          empty_column_functor{stream, mr},
-                                                          schema.child_types.at(child_name));
+    auto const& child_name = schema.child_types.begin()->first;
+    std::unique_ptr<column> child =
+      make_empty_column(schema.child_types.at(child_name), stream, mr);
     return make_fixed_width_column(schema.type, size, mask_state::ALL_NULL, stream, mr);
     auto indices   = make_zeroed_offsets(size - 1);
     auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr);
@@ -221,14 +165,22 @@ struct allnull_column_functor {
   std::unique_ptr<column> operator()(schema_element const& schema, size_type size) const
   {
     CUDF_EXPECTS(schema.child_types.size() == 1, "List column should have only one child");
-    auto const& child_name        = schema.child_types.begin()->first;
-    std::unique_ptr<column> child = cudf::type_dispatcher(schema.child_types.at(child_name).type,
-                                                          empty_column_functor{stream, mr},
-                                                          schema.child_types.at(child_name));
-    auto offsets                  = make_zeroed_offsets(size);
+    auto const& child_name = schema.child_types.begin()->first;
+    std::unique_ptr<column> child =
+      make_empty_column(schema.child_types.at(child_name), stream, mr);
+    auto offsets   = make_zeroed_offsets(size);
     auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr);
-    return make_lists_column(
-      size, std::move(offsets), std::move(child), size, std::move(null_mask), stream, mr);
+    std::vector<std::unique_ptr<column>> child_columns;
+    child_columns.push_back(std::move(offsets));
+    child_columns.push_back(std::move(child));
+    // Do not use `cudf::make_lists_column` since we do not need to call `purge_nonempty_nulls` on
+    // the child column as it does not have non-empty nulls. See issue #17356.
+    return std::make_unique<column>(cudf::data_type{type_id::LIST},
+                                    size,
+                                    rmm::device_buffer{},
+                                    std::move(null_mask),
+                                    size,
+                                    std::move(child_columns));
   }
 
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, cudf::struct_view>)>
@@ -240,8 +192,14 @@ struct allnull_column_functor {
         schema.child_types.at(child_name).type, *this, schema.child_types.at(child_name), size));
     }
     auto null_mask = cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr);
-    return make_structs_column(
-      size, std::move(child_columns), size, std::move(null_mask), stream, mr);
+    // Do not use `cudf::make_structs_column` since we do not need to call `superimpose_nulls` on
+    // the children columns. See issue #17356.
+    return std::make_unique<column>(cudf::data_type{type_id::STRUCT},
+                                    size,
+                                    rmm::device_buffer{},
+                                    std::move(null_mask),
+                                    size,
+                                    std::move(child_columns));
   }
 };
 
@@ -281,48 +239,4 @@ column_name_info make_column_name_info(schema_element const& schema, std::string
   }
   return info;
 }
-
-std::optional<data_type> get_path_data_type(
-  host_span<std::pair<std::string, cudf::io::json::NodeT> const> path,
-  cudf::io::json_reader_options const& options)
-{
-  if (path.empty()) return {};
-  std::optional<schema_element> col_schema = child_schema_element(path.back().first, options);
-  // check if it has value, then do recursive call and return.
-  if (col_schema.has_value()) {
-    return get_path_data_type(path, col_schema.value());
-  } else {
-    return {};
-  }
-}
-
-// idea: write a memoizer using template and lambda?, then call recursively.
-std::vector<path_from_tree::path_rep> path_from_tree::get_path(NodeIndexT this_col_id)
-{
-  std::vector<path_rep> path;
-  // stops at root.
-  while (this_col_id != parent_node_sentinel) {
-    auto type        = column_categories[this_col_id];
-    std::string name = "";
-    // code same as name_and_parent_index lambda.
-    auto parent_col_id = column_parent_ids[this_col_id];
-    if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
-      if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) {
-        name = column_names[this_col_id];
-      } else {
-        name = list_child_name;
-      }
-    } else if (column_categories[parent_col_id] == NC_FN) {
-      auto field_name_col_id = parent_col_id;
-      parent_col_id          = column_parent_ids[parent_col_id];
-      name                   = column_names[field_name_col_id];
-    }
-    // "name": type/schema
-    path.emplace_back(name, type);
-    this_col_id = parent_col_id;
-    if (this_col_id == row_array_parent_col_id) return path;
-  }
-  return {};
-}
-
 }  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 0cb5c382631..469f933f918 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include <cudf/detail/offsets_iterator.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
 #include <cudf/io/orc_types.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 
@@ -180,9 +181,9 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
   for (size_type i = 0; i < dict.map_slots.size(); i += block_size) {
     if (t + i < dict.map_slots.size()) {
-      auto window = dict.map_slots.begin() + t + i;
-      // Collect all slots from each window.
-      for (auto& slot : *window) {
+      auto bucket = dict.map_slots.begin() + t + i;
+      // Collect all slots from each bucket.
+      for (auto& slot : *bucket) {
         auto const key = slot.first;
         if (key != KEY_SENTINEL) {
           auto loc       = counter.fetch_add(1, memory_order_relaxed);
diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp
index 0949fafe9a4..f4e75f78dec 100644
--- a/cpp/src/io/orc/orc_gpu.hpp
+++ b/cpp/src/io/orc/orc_gpu.hpp
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include "io/comp/gpuinflate.hpp"
+#include "io/comp/comp.hpp"
 #include "io/statistics/statistics.cuh"
 #include "io/utilities/column_buffer.hpp"
 #include "orc.hpp"
@@ -47,16 +47,16 @@ using slot_type   = cuco::pair<key_type, mapped_type>;
 auto constexpr map_cg_size =
   1;  ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset.
       ///< Note: Adjust insert and find loops to use `cg::tile<map_cg_size>` if increasing this.
-auto constexpr window_size =
+auto constexpr bucket_size =
   1;  ///< Number of concurrent slots (set for best performance) handled by each thread.
 auto constexpr occupancy_factor = 1.43f;  ///< cuCollections suggests using a hash map of size
                                           ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
-using storage_type     = cuco::aow_storage<slot_type,
-                                       window_size,
-                                       cuco::extent<std::size_t>,
-                                       cudf::detail::cuco_allocator<char>>;
+using storage_type     = cuco::bucket_storage<slot_type,
+                                          bucket_size,
+                                          cuco::extent<std::size_t>,
+                                          cudf::detail::cuco_allocator<char>>;
 using storage_ref_type = typename storage_type::ref_type;
-using window_type      = typename storage_type::window_type;
+using bucket_type      = typename storage_type::bucket_type;
 using slot_type        = cuco::pair<key_type, mapped_type>;
 
 auto constexpr KEY_SENTINEL   = size_type{-1};
@@ -73,14 +73,14 @@ struct CompressedStreamInfo {
   uint8_t const* compressed_data{};  // [in] base ptr to compressed stream data
   uint8_t*
     uncompressed_data{};  // [in] base ptr to uncompressed stream data or NULL if not known yet
-  size_t compressed_data_size{};              // [in] compressed data size for this stream
-  device_span<uint8_t const>* dec_in_ctl{};   // [in] input buffer to decompress
-  device_span<uint8_t>* dec_out_ctl{};        // [in] output buffer to decompress into
-  device_span<compression_result> dec_res{};  // [in] results of decompression
-  device_span<uint8_t const>* copy_in_ctl{};  // [out] input buffer to copy
-  device_span<uint8_t>* copy_out_ctl{};       // [out] output buffer to copy to
-  uint32_t num_compressed_blocks{};           // [in,out] number of entries in decctl(in), number of
-                                              // compressed blocks(out)
+  size_t compressed_data_size{};             // [in] compressed data size for this stream
+  device_span<uint8_t const>* dec_in_ctl{};  // [in] input buffer to decompress
+  device_span<uint8_t>* dec_out_ctl{};       // [in] output buffer to decompress into
+  device_span<cudf::io::detail::compression_result> dec_res{};  // [in] results of decompression
+  device_span<uint8_t const>* copy_in_ctl{};                    // [out] input buffer to copy
+  device_span<uint8_t>* copy_out_ctl{};                         // [out] output buffer to copy to
+  uint32_t num_compressed_blocks{};    // [in,out] number of entries in decctl(in), number of
+                                       // compressed blocks(out)
   uint32_t num_uncompressed_blocks{};  // [in,out] number of entries in dec_in_ctl(in), number of
                                        // uncompressed blocks(out)
   uint64_t max_uncompressed_size{};    // [out] maximum uncompressed data size of stream
@@ -193,7 +193,7 @@ struct StripeStream {
  */
 struct stripe_dictionary {
   // input
-  device_span<window_type> map_slots;  // hash map (windows) storage
+  device_span<bucket_type> map_slots;  // hash map (buckets) storage
   uint32_t column_idx      = 0;        // column index
   size_type start_row      = 0;        // first row in the stripe
   size_type start_rowgroup = 0;        // first rowgroup in the stripe
@@ -414,7 +414,7 @@ std::optional<writer_compression_statistics> CompressOrcDataStreams(
   bool collect_statistics,
   device_2dspan<StripeStream> strm_desc,
   device_2dspan<encoder_chunk_streams> enc_streams,
-  device_span<compression_result> comp_res,
+  device_span<cudf::io::detail::compression_result> comp_res,
   rmm::cuda_stream_view stream);
 
 /**
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index fcaee9c548e..726c79bd004 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -22,7 +22,7 @@
 
 #include <cudf/detail/timezone.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
-#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/device_buffer.hpp>
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 0081ed30d17..b661bb4ff90 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -269,7 +269,7 @@ rmm::device_buffer decompress_stripe_data(
                                                          num_uncompressed_blocks};
     device_span<device_span<uint8_t>> copy_out_view{inflate_out.data() + num_compressed_blocks,
                                                     num_uncompressed_blocks};
-    gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream);
+    cudf::io::detail::gpu_copy_uncompressed_blocks(copy_in_view, copy_out_view, stream);
   }
 
   // Copy without stream sync, thus need to wait for stream sync below to access.
diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
index 1572b7246c0..1f84d1f81dc 100644
--- a/cpp/src/io/orc/stripe_data.cu
+++ b/cpp/src/io/orc/stripe_data.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -132,6 +132,177 @@ struct orcdec_state_s {
   } vals;
 };
 
+/**
+ * @brief Manage caching of the first run of TIMESTAMP's DATA stream for a row group.
+ *
+ * This class is used to address a special case, where the first run of the DATA stream spans two
+ * adjacent row groups and its length is greater than the maximum length allowed to be consumed.
+ * This limit is imposed by the decoder when processing the SECONDARY stream. This class shall be
+ * instantiated in the shared memory, and be used to cache the DATA stream with a decoded data type
+ * of `int64_t`. As an optimization, the actual cache is implemented in the cache_helper class as a
+ * local variable and does not reside in the shared memory.
+ */
+class run_cache_manager {
+ private:
+  enum class status : uint8_t {
+    DISABLED,  ///< Run cache manager is disabled. No caching will be performed. If the special case
+               ///< happens, the run cache manager will be set to this status after the cache read
+               ///< is completed. This status also applies when the special case does not happen.
+    CAN_WRITE_TO_CACHE,  ///< Run cache manager is ready for write. If the special case happens, the
+                         ///< run cache manager will be set to this status.
+    CAN_READ_FROM_CACHE,  ///< Run cache manager is ready for read. If the special case happens, the
+                          ///< run cache manager will be set to this status after the cache write is
+                          ///< completed.
+  };
+
+ public:
+  /**
+   * @brief Initialize the status and reset both cached lengths to zero.
+   *
+   * @param[in] s ORC decoder state; TIMESTAMP with run_pos[CI_DATA2] > 0 enables caching.
+   */
+  __device__ void initialize(orcdec_state_s* s)
+  {
+    _status          = (s->top.data.index.run_pos[CI_DATA2] > 0 and s->chunk.type_kind == TIMESTAMP)
+                         ? status::CAN_WRITE_TO_CACHE
+                         : status::DISABLED;
+    _reusable_length = 0;
+    _run_length      = 0;
+  }
+
+ private:
+  status _status;  ///< The status of the run cache manager.
+  uint32_t
+    _reusable_length;  ///< Number of values to be cached and reused later. For example, if a run
+                       ///< has a length of 512 but the maximum length allowed to be consumed is
+                       ///< capped at 162, then 350 (512-162) values will be cached.
+  uint32_t _run_length;  ///< The length of the run, 512 in the above example.
+  friend class cache_helper;
+};
+
+/**
+ * @brief Helper class to help run_cache_manager cache the first run of TIMESTAMP's DATA stream for
+ * a row group.
+ *
+ * The run_cache_manager is intended to be stored in the shared memory, whereas the actual cache is
+ * in the local storage (as an optimization). If a function is to use run_cache_manager, both the
+ * manager and the cache objects need to be passed. This class is introduced to simplify the
+ * function call, so that only a single cache_helper object needs to be passed. To that end, public
+ * methods originally belonging to run_cache_manager have been moved to this class.
+ */
+class cache_helper {
+ public:
+  /**
+   * @brief Bind this helper to a run_cache_manager; the reference is stored, not copied.
+   *
+   * @param[in] run_cache_manager_inst Manager whose state this helper reads and updates.
+   */
+  __device__ explicit cache_helper(run_cache_manager& run_cache_manager_inst)
+    : _manager(run_cache_manager_inst)
+  {
+  }
+
+  /**
+   * @brief Record the run length and its excess over max_length while in CAN_WRITE_TO_CACHE.
+   *
+   * @param[in] run_length The length of the first run (spanning two adjacent row groups) of the
+   * DATA stream.
+   * @param[in] max_length The maximum length allowed to be consumed. This limit is imposed
+   * by the decoder when processing the SECONDARY stream.
+   */
+  __device__ void set_reusable_length(uint32_t run_length, uint32_t max_length)
+  {
+    if (_manager._status == run_cache_manager::status::CAN_WRITE_TO_CACHE) {
+      _manager._run_length = run_length;
+      _manager._reusable_length =
+        (_manager._run_length > max_length) ? (_manager._run_length - max_length) : 0;
+    }
+  }
+
+  /**
+   * @brief Reduce the maximum length allowed to be consumed by the reusable (cached) length while
+   * the manager is in CAN_READ_FROM_CACHE; in any other status the value is returned unchanged.
+   *
+   * @param[in] max_length The maximum length allowed to be consumed for the DATA stream.
+   * @return The possibly reduced maximum length.
+   */
+  [[nodiscard]] __device__ uint32_t adjust_max_length(uint32_t max_length)
+  {
+    auto new_max_length{max_length};
+    if (_manager._status == run_cache_manager::status::CAN_READ_FROM_CACHE) {
+      new_max_length -= _manager._reusable_length;  // NOTE(review): wraps if > max_length
+    }
+    return new_max_length;
+  }
+
+  /**
+   * @brief Copy the excess values from the intermediate buffer for the DATA stream to the cache.
+   *
+   * @param[in] src Intermediate buffer for the DATA stream.
+   */
+  __device__ void write_to_cache(int64_t* src)
+  {
+    if (_manager._status != run_cache_manager::status::CAN_WRITE_TO_CACHE) { return; }
+
+    auto const tid = threadIdx.x;
+
+    __syncthreads();
+
+    // All threads in the block always take a uniform code path for the following branches.
+    // _reusable_length ranges between [0, 512].
+    if (_manager._reusable_length > 0) {
+      auto const length_to_skip = _manager._run_length - _manager._reusable_length;
+      if (tid < _manager._reusable_length) {
+        auto const src_idx = tid + length_to_skip;
+        _storage           = src[src_idx];  // each thread keeps one value
+      }
+      if (tid == 0) { _manager._status = run_cache_manager::status::CAN_READ_FROM_CACHE; }
+    } else {
+      if (tid == 0) { _manager._status = run_cache_manager::status::DISABLED; }
+    }
+
+    __syncthreads();
+  }
+
+  /**
+   * @brief Prepend the cached values to the intermediate buffer for the DATA stream.
+   *
+   * @param[in,out] dst Intermediate buffer for the DATA stream.
+   * @param[in,out] rle Run length decoder state; `num_vals` grows by the reusable length.
+   */
+  __device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle)
+  {
+    if (_manager._status != run_cache_manager::status::CAN_READ_FROM_CACHE) { return; }
+
+    auto const tid = threadIdx.x;
+
+    // First, shift the existing values up by _reusable_length to free the front of the buffer
+    auto const dst_idx = tid + _manager._reusable_length;
+    auto const v       = (dst_idx < rle->num_vals + _manager._reusable_length) ? dst[tid] : 0;
+    __syncthreads();
+
+    if (dst_idx < rle->num_vals + _manager._reusable_length) { dst[dst_idx] = v; }
+    __syncthreads();
+
+    // Second, insert the cached values at the front of the buffer
+    if (tid < _manager._reusable_length) { dst[tid] = _storage; }
+    __syncthreads();
+
+    if (tid == 0) {
+      // Disable the run cache manager, since cache write-and-read happens at most once per row
+      // group.
+      _manager._status = run_cache_manager::status::DISABLED;
+      rle->num_vals += _manager._reusable_length;
+    }
+
+    __syncthreads();
+  }
+
+ private:
+  run_cache_manager& _manager;  ///< An instance of run_cache_manager.
+  int64_t _storage;             ///< Per-thread cache storage.
+};
+
 /**
  * @brief Initializes byte stream, modifying length and start position to keep the read pointer
  * 8-byte aligned.
@@ -631,6 +802,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = {
  * @param[in] maxvals maximum number of values to decode
  * @param[in] t thread id
  * @param[in] has_buffered_values If true, means there are already buffered values
+ * @param[in] cache_helper_inst If non-null, the run cache manager will be used to manage
+ * caching of the first run of the DATA stream.
  *
  * @return number of values decoded
  */
@@ -640,9 +813,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
                                          T* vals,
                                          uint32_t maxvals,
                                          int t,
-                                         bool has_buffered_values = false)
+                                         bool has_buffered_values        = false,
+                                         cache_helper* cache_helper_inst = nullptr)
 {
   if (t == 0) {
+    if (cache_helper_inst != nullptr) { maxvals = cache_helper_inst->adjust_max_length(maxvals); }
     uint32_t maxpos  = min(bs->len, bs->pos + (bytestream_buffer_size - 8u));
     uint32_t lastpos = bs->pos;
     auto numvals     = 0;
@@ -685,6 +860,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
           l += deltapos;
         }
       }
+
+      if (cache_helper_inst != nullptr) { cache_helper_inst->set_reusable_length(n, maxvals); }
+
       if ((numvals != 0) and (numvals + n > maxvals)) break;
       // case where there are buffered values and can't consume a whole chunk
       // from decoded values, so skip adding any more to buffer, work on buffered values and then
@@ -866,6 +1044,17 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs,
     __syncwarp();
   }
   __syncthreads();
+  // Currently run_cache_manager is only designed to fix the TIMESTAMP's DATA stream bug where the
+  // data type is int64_t.
+  if constexpr (cuda::std::is_same_v<T, int64_t>) {
+    if (cache_helper_inst != nullptr) {
+      // Run cache is read from during the 2nd iteration of the top-level while loop in
+      // gpuDecodeOrcColumnData().
+      cache_helper_inst->read_from_cache(vals, rle);
+      // Run cache is written to during the 1st iteration of the loop.
+      cache_helper_inst->write_to_cache(vals);
+    }
+  }
   return rle->num_vals;
 }
 
@@ -1401,6 +1590,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   // Struct doesn't have any data in itself, so skip
   bool const is_valid       = s->chunk.type_kind != STRUCT;
   size_t const max_num_rows = s->chunk.column_num_rows;
+  __shared__ run_cache_manager run_cache_manager_inst;
+  cache_helper cache_helper_inst(run_cache_manager_inst);
   if (t == 0 and is_valid) {
     // If we have an index, seek to the initial run and update row positions
     if (num_rowgroups > 0) {
@@ -1443,6 +1634,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
     bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]);
     bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]);
+
+    run_cache_manager_inst.initialize(s);
   }
   __syncthreads();
 
@@ -1602,7 +1795,13 @@ CUDF_KERNEL void __launch_bounds__(block_size)
         if (is_rlev1(s->chunk.encoding_kind)) {
           numvals = Integer_RLEv1<int64_t>(bs, &s->u.rlev1, s->vals.i64, numvals, t);
         } else {
-          numvals = Integer_RLEv2<int64_t>(bs, &s->u.rlev2, s->vals.i64, numvals, t);
+          numvals = Integer_RLEv2<int64_t>(bs,
+                                           &s->u.rlev2,
+                                           s->vals.i64,
+                                           numvals,
+                                           t,
+                                           false /**has_buffered_values */,
+                                           &cache_helper_inst);
         }
         if (s->chunk.type_kind == DECIMAL) {
           // If we're using an index, we may have to drop values from the initial run
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index ed0b6969154..79ecca0ca99 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "io/comp/gpuinflate.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/utilities/block_utils.cuh"
 #include "io/utilities/time_utils.cuh"
@@ -23,10 +24,10 @@
 #include <cudf/detail/utilities/batched_memcpy.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/orc_types.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
@@ -44,7 +45,11 @@ namespace io {
 namespace orc {
 namespace gpu {
 
+namespace nvcomp = cudf::io::detail::nvcomp;
+
 using cudf::detail::device_2dspan;
+using cudf::io::detail::compression_result;
+using cudf::io::detail::compression_status;
 
 constexpr int scratch_buffer_size        = 512 * 4;
 constexpr int compact_streams_block_size = 1024;
@@ -1385,7 +1390,7 @@ std::optional<writer_compression_statistics> CompressOrcDataStreams(
   if (compression == SNAPPY) {
     try {
       if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) {
-        gpu_snap(comp_in, comp_out, comp_res, stream);
+        cudf::io::detail::gpu_snap(comp_in, comp_out, comp_res, stream);
       } else {
         nvcomp::batched_compress(
           nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream);
@@ -1429,7 +1434,7 @@ std::optional<writer_compression_statistics> CompressOrcDataStreams(
     strm_desc, comp_in, comp_out, comp_res, compressed_data, comp_blk_size, max_comp_blk_size);
 
   if (collect_statistics) {
-    return cudf::io::collect_compression_statistics(comp_in, comp_res, stream);
+    return cudf::io::detail::collect_compression_statistics(comp_in, comp_res, stream);
   } else {
     return std::nullopt;
   }
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index d432deb8e79..ce868b83c04 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -19,7 +19,6 @@
  * @brief cuDF-IO ORC writer class implementation
  */
 
-#include "cudf/detail/utilities/cuda_memcpy.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/orc/orc_gpu.hpp"
 #include "io/statistics/column_statistics.cuh"
@@ -28,10 +27,12 @@
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/utilities/batched_memcpy.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -70,6 +71,8 @@
 
 namespace cudf::io::orc::detail {
 
+namespace nvcomp = cudf::io::detail::nvcomp;
+
 template <typename T>
 [[nodiscard]] constexpr int varint_size(T val)
 {
@@ -506,7 +509,7 @@ size_t max_varint_size()
   return cudf::util::div_rounding_up_unsafe(sizeof(T) * 8, 7);
 }
 
-constexpr size_t RLE_stream_size(TypeKind kind, size_t count)
+size_t RLE_stream_size(TypeKind kind, size_t count)
 {
   using cudf::util::div_rounding_up_unsafe;
   constexpr auto byte_rle_max_len = 128;
@@ -1386,29 +1389,34 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer,
   // we know the size of each array. The number of stripes per column in a chunk array can
   // be calculated by dividing the number of chunks by the number of columns.
   // That many chunks need to be copied at a time to the proper destination.
-  size_t num_entries_seen = 0;
+  size_t num_entries_seen        = 0;
+  auto const num_buffers_to_copy = per_chunk_stats.stripe_stat_chunks.size() * num_columns * 2;
+  auto h_srcs = cudf::detail::make_empty_host_vector<void*>(num_buffers_to_copy, stream);
+  auto h_dsts = cudf::detail::make_empty_host_vector<void*>(num_buffers_to_copy, stream);
+  auto h_lens = cudf::detail::make_empty_host_vector<size_t>(num_buffers_to_copy, stream);
+
   for (size_t i = 0; i < per_chunk_stats.stripe_stat_chunks.size(); ++i) {
     auto const stripes_per_col = per_chunk_stats.stripe_stat_chunks[i].size() / num_columns;
 
-    auto const chunk_bytes = stripes_per_col * sizeof(statistics_chunk);
-    auto const merge_bytes = stripes_per_col * sizeof(statistics_merge_group);
     for (size_t col = 0; col < num_columns; ++col) {
-      CUDF_CUDA_TRY(
-        cudaMemcpyAsync(stat_chunks.data() + (num_stripes * col) + num_entries_seen,
-                        per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col,
-                        chunk_bytes,
-                        cudaMemcpyDefault,
-                        stream.value()));
-      CUDF_CUDA_TRY(
-        cudaMemcpyAsync(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen,
-                        per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col,
-                        merge_bytes,
-                        cudaMemcpyDefault,
-                        stream.value()));
+      h_srcs.push_back(per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col);
+      h_dsts.push_back(stat_chunks.data() + (num_stripes * col) + num_entries_seen);
+      h_lens.push_back(stripes_per_col * sizeof(statistics_chunk));
+
+      h_srcs.push_back(per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col);
+      h_dsts.push_back(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen);
+      h_lens.push_back(stripes_per_col * sizeof(statistics_merge_group));
     }
     num_entries_seen += stripes_per_col;
   }
 
+  auto const& mr    = cudf::get_current_device_resource_ref();
+  auto const d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr);
+  auto const d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr);
+  auto const d_lens = cudf::detail::make_device_uvector_async(h_lens, stream, mr);
+  cudf::detail::batched_memcpy_async(
+    d_srcs.begin(), d_dsts.begin(), d_lens.begin(), d_srcs.size(), stream);
+
   auto file_stats_merge =
     cudf::detail::make_host_vector<statistics_merge_group>(num_file_blobs, stream);
   for (auto i = 0u; i < num_file_blobs; ++i) {
@@ -2017,8 +2025,8 @@ size_t max_compression_output_size(CompressionKind compression_kind, uint32_t co
 {
   if (compression_kind == NONE) return 0;
 
-  return compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind),
-                                        compression_blocksize);
+  return nvcomp::compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind),
+                                                compression_blocksize);
 }
 
 std::unique_ptr<table_input_metadata> make_table_meta(table_view const& input)
diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index b85ebf2fa1a..0d40a1f7b1b 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
 
 #include <rmm/exec_policy.hpp>
@@ -210,7 +211,7 @@ struct map_find_fn {
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  populate_chunk_hash_maps_kernel(device_span<window_type> const map_storage,
+  populate_chunk_hash_maps_kernel(device_span<bucket_type> const map_storage,
                                   cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto const col_idx = blockIdx.y;
@@ -239,7 +240,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  collect_map_entries_kernel(device_span<window_type> const map_storage,
+  collect_map_entries_kernel(device_span<bucket_type> const map_storage,
                              device_span<EncColumnChunk> chunks)
 {
   auto& chunk = chunks[blockIdx.x];
@@ -251,11 +252,11 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   if (t == 0) { new (&counter) cuda::atomic<size_type, SCOPE>{0}; }
   __syncthreads();
 
-  // Iterate over all windows in the map.
+  // Iterate over all buckets in the map.
   for (; t < chunk.dict_map_size; t += block_size) {
-    auto window = map_storage.data() + chunk.dict_map_offset + t;
-    // Collect all slots from each window.
-    for (auto& slot : *window) {
+    auto bucket = map_storage.data() + chunk.dict_map_offset + t;
+    // Collect all slots from each bucket.
+    for (auto& slot : *bucket) {
       auto const key = slot.first;
       if (key != KEY_SENTINEL) {
         auto const loc = counter.fetch_add(1, memory_order_relaxed);
@@ -272,7 +273,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  get_dictionary_indices_kernel(device_span<window_type> const map_storage,
+  get_dictionary_indices_kernel(device_span<bucket_type> const map_storage,
                                 cudf::detail::device_2dspan<PageFragment const> frags)
 {
   auto const col_idx = blockIdx.y;
@@ -302,7 +303,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
                   s_ck_start_val_idx);
 }
 
-void populate_chunk_hash_maps(device_span<window_type> const map_storage,
+void populate_chunk_hash_maps(device_span<bucket_type> const map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream)
 {
@@ -311,7 +312,7 @@ void populate_chunk_hash_maps(device_span<window_type> const map_storage,
     <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_storage, frags);
 }
 
-void collect_map_entries(device_span<window_type> const map_storage,
+void collect_map_entries(device_span<bucket_type> const map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream)
 {
@@ -320,7 +321,7 @@ void collect_map_entries(device_span<window_type> const map_storage,
     <<<chunks.size(), block_size, 0, stream.value()>>>(map_storage, chunks);
 }
 
-void get_dictionary_indices(device_span<window_type> const map_storage,
+void get_dictionary_indices(device_span<bucket_type> const map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream)
 {
diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu
index 9acbe026bb2..32bb3349666 100644
--- a/cpp/src/io/parquet/decode_fixed.cu
+++ b/cpp/src/io/parquet/decode_fixed.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -961,9 +961,6 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8)
     return;
   }
 
-  // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
-  if (s->num_rows == 0) { return; }
-
   using value_decoder_type = std::conditional_t<
     split_decode_t,
     decode_fixed_width_split_values_func<decode_block_size_t, has_lists_t, state_buf_t>,
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index e9558735929..a1edd21f8a2 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -51,6 +51,9 @@ namespace {
 
 using ::cudf::detail::device_2dspan;
 
+using cudf::io::detail::compression_result;
+using cudf::io::detail::compression_status;
+
 constexpr int encode_block_size = 128;
 constexpr int rle_buffer_size   = 2 * encode_block_size;
 constexpr int num_encode_warps  = encode_block_size / cudf::detail::warp_size;
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 7c09764da2d..800875f7448 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -34,7 +34,7 @@ using slot_type   = cuco::pair<key_type, mapped_type>;
 auto constexpr map_cg_size =
   1;  ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset.
       ///< Note: Adjust insert and find loops to use `cg::tile<map_cg_size>` if increasing this.
-auto constexpr window_size =
+auto constexpr bucket_size =
   1;  ///< Number of concurrent slots (set for best performance) handled by each thread.
 auto constexpr occupancy_factor = 1.43f;  ///< cuCollections suggests using a hash map of size
                                           ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
@@ -43,12 +43,12 @@ auto constexpr KEY_SENTINEL   = key_type{-1};
 auto constexpr VALUE_SENTINEL = mapped_type{-1};
 auto constexpr SCOPE          = cuda::thread_scope_block;
 
-using storage_type     = cuco::aow_storage<slot_type,
-                                       window_size,
-                                       cuco::extent<std::size_t>,
-                                       cudf::detail::cuco_allocator<char>>;
+using storage_type     = cuco::bucket_storage<slot_type,
+                                          bucket_size,
+                                          cuco::extent<std::size_t>,
+                                          cudf::detail::cuco_allocator<char>>;
 using storage_ref_type = typename storage_type::ref_type;
-using window_type      = typename storage_type::window_type;
+using bucket_type      = typename storage_type::bucket_type;
 
 /**
  * @brief Return the byte length of parquet dtypes that are physically represented by INT32
@@ -100,7 +100,7 @@ inline size_type __device__ row_to_value_idx(size_type idx,
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void populate_chunk_hash_maps(device_span<window_type> const map_storage,
+void populate_chunk_hash_maps(device_span<bucket_type> const map_storage,
                               cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream);
 
@@ -111,7 +111,7 @@ void populate_chunk_hash_maps(device_span<window_type> const map_storage,
  * @param chunks Flat span of chunks to compact hash maps for
  * @param stream CUDA stream to use
  */
-void collect_map_entries(device_span<window_type> const map_storage,
+void collect_map_entries(device_span<bucket_type> const map_storage,
                          device_span<EncColumnChunk> chunks,
                          rmm::cuda_stream_view stream);
 
@@ -128,7 +128,7 @@ void collect_map_entries(device_span<window_type> const map_storage,
  * @param frags Column fragments
  * @param stream CUDA stream to use
  */
-void get_dictionary_indices(device_span<window_type> const map_storage,
+void get_dictionary_indices(device_span<bucket_type> const map_storage,
                             cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream);
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index ce9d48693ec..b2563ab5065 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -17,7 +17,7 @@
 #pragma once
 
 #include "error.hpp"
-#include "io/comp/gpuinflate.hpp"
+#include "io/comp/comp.hpp"
 #include "io/parquet/parquet.hpp"
 #include "io/parquet/parquet_common.hpp"
 #include "io/statistics/statistics.cuh"
@@ -599,12 +599,12 @@ struct EncColumnChunk {
  */
 struct EncPage {
   // all pointers at the top to keep things properly aligned
-  uint8_t* page_data;            //!< Ptr to uncompressed page
-  uint8_t* compressed_data;      //!< Ptr to compressed page
-  EncColumnChunk* chunk;         //!< Chunk that this page belongs to
-  compression_result* comp_res;  //!< Ptr to compression result
-  uint32_t* def_histogram;       //!< Histogram of counts for each definition level
-  uint32_t* rep_histogram;       //!< Histogram of counts for each repetition level
+  uint8_t* page_data;                              //!< Ptr to uncompressed page
+  uint8_t* compressed_data;                        //!< Ptr to compressed page
+  EncColumnChunk* chunk;                           //!< Chunk that this page belongs to
+  cudf::io::detail::compression_result* comp_res;  //!< Ptr to compression result
+  uint32_t* def_histogram;  //!< Histogram of counts for each definition level
+  uint32_t* rep_histogram;  //!< Histogram of counts for each repetition level
   // put this here in case it's ever made 64-bit
   encode_kernel_mask kernel_mask;  //!< Mask used to control which encoding kernels to run
   // the rest can be 4 byte aligned
@@ -1023,7 +1023,7 @@ void EncodePages(device_span<EncPage> pages,
                  bool write_v2_headers,
                  device_span<device_span<uint8_t const>> comp_in,
                  device_span<device_span<uint8_t>> comp_out,
-                 device_span<compression_result> comp_res,
+                 device_span<cudf::io::detail::compression_result> comp_res,
                  rmm::cuda_stream_view stream);
 
 /**
@@ -1046,7 +1046,7 @@ void DecideCompression(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view
  * @param[in] stream CUDA stream to use
  */
 void EncodePageHeaders(device_span<EncPage> pages,
-                       device_span<compression_result const> comp_res,
+                       device_span<cudf::io::detail::compression_result const> comp_res,
                        device_span<statistics_chunk const> page_stats,
                        statistics_chunk const* chunk_stats,
                        rmm::cuda_stream_view stream);
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index b0cbabf1c12..9047ff9169b 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -265,7 +265,6 @@ class stats_expression_converter : public ast::detail::expression_transformer {
    */
   std::reference_wrapper<ast::expression const> visit(ast::literal const& expr) override
   {
-    _stats_expr = std::reference_wrapper<ast::expression const>(expr);
     return expr;
   }
 
@@ -278,7 +277,6 @@ class stats_expression_converter : public ast::detail::expression_transformer {
                  "Statistics AST supports only left table");
     CUDF_EXPECTS(expr.get_column_index() < _num_columns,
                  "Column index cannot be more than number of columns in the table");
-    _stats_expr = std::reference_wrapper<ast::expression const>(expr);
     return expr;
   }
 
@@ -307,6 +305,9 @@ class stats_expression_converter : public ast::detail::expression_transformer {
       CUDF_EXPECTS(dynamic_cast<ast::literal const*>(&operands[1].get()) != nullptr,
                    "Second operand of binary operation with column reference must be a literal");
       v->accept(*this);
+      // Push literal into the ast::tree
+      auto const& literal =
+        _stats_expr.push(*dynamic_cast<ast::literal const*>(&operands[1].get()));
       auto const col_index = v->get_column_index();
       switch (op) {
         /* transform to stats conditions. op(col, literal)
@@ -318,34 +319,33 @@ class stats_expression_converter : public ast::detail::expression_transformer {
         col1 <= val --> vmin <= val
         */
         case ast_operator::EQUAL: {
-          auto const& vmin = _col_ref.emplace_back(col_index * 2);
-          auto const& vmax = _col_ref.emplace_back(col_index * 2 + 1);
-          auto const& op1 =
-            _operators.emplace_back(ast_operator::LESS_EQUAL, vmin, operands[1].get());
-          auto const& op2 =
-            _operators.emplace_back(ast_operator::GREATER_EQUAL, vmax, operands[1].get());
-          _operators.emplace_back(ast::ast_operator::LOGICAL_AND, op1, op2);
+          auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2});
+          auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1});
+          _stats_expr.push(ast::operation{
+            ast::ast_operator::LOGICAL_AND,
+            _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}),
+            _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})});
           break;
         }
         case ast_operator::NOT_EQUAL: {
-          auto const& vmin = _col_ref.emplace_back(col_index * 2);
-          auto const& vmax = _col_ref.emplace_back(col_index * 2 + 1);
-          auto const& op1  = _operators.emplace_back(ast_operator::NOT_EQUAL, vmin, vmax);
-          auto const& op2 =
-            _operators.emplace_back(ast_operator::NOT_EQUAL, vmax, operands[1].get());
-          _operators.emplace_back(ast_operator::LOGICAL_OR, op1, op2);
+          auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2});
+          auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1});
+          _stats_expr.push(ast::operation{
+            ast_operator::LOGICAL_OR,
+            _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}),
+            _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})});
           break;
         }
         case ast_operator::LESS: [[fallthrough]];
         case ast_operator::LESS_EQUAL: {
-          auto const& vmin = _col_ref.emplace_back(col_index * 2);
-          _operators.emplace_back(op, vmin, operands[1].get());
+          auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2});
+          _stats_expr.push(ast::operation{op, vmin, literal});
           break;
         }
         case ast_operator::GREATER: [[fallthrough]];
         case ast_operator::GREATER_EQUAL: {
-          auto const& vmax = _col_ref.emplace_back(col_index * 2 + 1);
-          _operators.emplace_back(op, vmax, operands[1].get());
+          auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1});
+          _stats_expr.push(ast::operation{op, vmax, literal});
           break;
         }
         default: CUDF_FAIL("Unsupported operation in Statistics AST");
@@ -353,13 +353,12 @@ class stats_expression_converter : public ast::detail::expression_transformer {
     } else {
       auto new_operands = visit_operands(operands);
       if (cudf::ast::detail::ast_operator_arity(op) == 2) {
-        _operators.emplace_back(op, new_operands.front(), new_operands.back());
+        _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()});
       } else if (cudf::ast::detail::ast_operator_arity(op) == 1) {
-        _operators.emplace_back(op, new_operands.front());
+        _stats_expr.push(ast::operation{op, new_operands.front()});
       }
     }
-    _stats_expr = std::reference_wrapper<ast::expression const>(_operators.back());
-    return std::reference_wrapper<ast::expression const>(_operators.back());
+    return _stats_expr.back();
   }
 
   /**
@@ -369,7 +368,7 @@ class stats_expression_converter : public ast::detail::expression_transformer {
    */
   [[nodiscard]] std::reference_wrapper<ast::expression const> get_stats_expr() const
   {
-    return _stats_expr.value().get();
+    return _stats_expr.back();
   }
 
  private:
@@ -383,10 +382,8 @@ class stats_expression_converter : public ast::detail::expression_transformer {
     }
     return transformed_operands;
   }
-  std::optional<std::reference_wrapper<ast::expression const>> _stats_expr;
+  ast::tree _stats_expr;
   size_type _num_columns;
-  std::list<ast::column_reference> _col_ref;
-  std::list<ast::operation> _operators;
 };
 }  // namespace
 
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 27312a4da89..933be889b1a 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -15,6 +15,8 @@
  */
 
 #include "compact_protocol_reader.hpp"
+#include "io/comp/comp.hpp"
+#include "io/comp/gpuinflate.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/utilities/time_utils.cuh"
 #include "reader_impl.hpp"
@@ -44,6 +46,10 @@ namespace cudf::io::parquet::detail {
 
 namespace {
 
+namespace nvcomp = cudf::io::detail::nvcomp;
+using cudf::io::detail::compression_result;
+using cudf::io::detail::compression_status;
+
 struct split_info {
   row_range rows;
   int64_t split_pos;
@@ -795,14 +801,16 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
       num_comp_pages++;
     });
     if (codec.compression_type == BROTLI && codec.num_pages > 0) {
-      debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream);
+      debrotli_scratch.resize(cudf::io::detail::get_gpu_debrotli_scratch_size(codec.num_pages),
+                              stream);
     }
   }
 
   // Dispatch batches of pages to decompress for each codec.
   // Buffer needs to be padded, required by `gpuDecodePageData`.
   rmm::device_buffer decomp_pages(
-    cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
+    cudf::util::round_up_safe(total_decomp_size, cudf::io::detail::BUFFER_PADDING_MULTIPLE),
+    stream);
 
   auto comp_in =
     cudf::detail::make_empty_host_vector<device_span<uint8_t const>>(num_comp_pages, stream);
@@ -874,8 +882,11 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
                                      codec.total_decomp_size,
                                      stream);
         } else {
-          gpuinflate(
-            d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream);
+          gpuinflate(d_comp_in_view,
+                     d_comp_out_view,
+                     d_comp_res_view,
+                     cudf::io::detail::gzip_header_included::YES,
+                     stream);
         }
         break;
       case SNAPPY:
@@ -937,7 +948,7 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
     auto const d_copy_out = cudf::detail::make_device_uvector_async(
       copy_out, stream, cudf::get_current_device_resource_ref());
 
-    gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream);
+    cudf::io::detail::gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream);
     stream.synchronize();
   }
 
@@ -1085,32 +1096,29 @@ struct get_decomp_scratch {
       case UNCOMPRESSED:
       case GZIP: return 0;
 
-      case BROTLI: return get_gpu_debrotli_scratch_size(di.num_pages);
+      case BROTLI: return cudf::io::detail::get_gpu_debrotli_scratch_size(di.num_pages);
 
       case SNAPPY:
         if (cudf::io::nvcomp_integration::is_stable_enabled()) {
-          return cudf::io::nvcomp::batched_decompress_temp_size(
-            cudf::io::nvcomp::compression_type::SNAPPY,
-            di.num_pages,
-            di.max_page_decompressed_size,
-            di.total_decompressed_size);
+          return nvcomp::batched_decompress_temp_size(nvcomp::compression_type::SNAPPY,
+                                                      di.num_pages,
+                                                      di.max_page_decompressed_size,
+                                                      di.total_decompressed_size);
         } else {
           return 0;
         }
         break;
 
       case ZSTD:
-        return cudf::io::nvcomp::batched_decompress_temp_size(
-          cudf::io::nvcomp::compression_type::ZSTD,
-          di.num_pages,
-          di.max_page_decompressed_size,
-          di.total_decompressed_size);
+        return nvcomp::batched_decompress_temp_size(nvcomp::compression_type::ZSTD,
+                                                    di.num_pages,
+                                                    di.max_page_decompressed_size,
+                                                    di.total_decompressed_size);
       case LZ4_RAW:
-        return cudf::io::nvcomp::batched_decompress_temp_size(
-          cudf::io::nvcomp::compression_type::LZ4,
-          di.num_pages,
-          di.max_page_decompressed_size,
-          di.total_decompressed_size);
+        return nvcomp::batched_decompress_temp_size(nvcomp::compression_type::LZ4,
+                                                    di.num_pages,
+                                                    di.max_page_decompressed_size,
+                                                    di.total_decompressed_size);
 
       default: CUDF_FAIL("Invalid compression codec for parquet decompression");
     }
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index bfd0cc992cf..0dd1aff41e9 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -23,7 +23,7 @@
 #include "ipc/Message_generated.h"
 #include "ipc/Schema_generated.h"
 
-#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/logger.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index bcdae4cbd3b..326232ced60 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -15,6 +15,7 @@
  */
 
 #include "error.hpp"
+#include "io/comp/common.hpp"
 #include "reader_impl.hpp"
 
 #include <cudf/detail/iterator.cuh>
@@ -251,8 +252,8 @@ void generate_depth_remappings(
       if (source->is_device_read_preferred(io_size)) {
         // Buffer needs to be padded.
         // Required by `gpuDecodePageData`.
-        page_data[chunk] =
-          rmm::device_buffer(cudf::util::round_up_safe(io_size, BUFFER_PADDING_MULTIPLE), stream);
+        page_data[chunk] = rmm::device_buffer(
+          cudf::util::round_up_safe(io_size, cudf::io::detail::BUFFER_PADDING_MULTIPLE), stream);
         auto fut_read_size = source->device_read_async(
           io_offset, io_size, static_cast<uint8_t*>(page_data[chunk].data()), stream);
         read_tasks.emplace_back(std::move(fut_read_size));
@@ -261,7 +262,8 @@ void generate_depth_remappings(
         // Buffer needs to be padded.
         // Required by `gpuDecodePageData`.
         page_data[chunk] = rmm::device_buffer(
-          cudf::util::round_up_safe(read_buffer->size(), BUFFER_PADDING_MULTIPLE), stream);
+          cudf::util::round_up_safe(read_buffer->size(), cudf::io::detail::BUFFER_PADDING_MULTIPLE),
+          stream);
         CUDF_CUDA_TRY(cudaMemcpyAsync(page_data[chunk].data(),
                                       read_buffer->data(),
                                       read_buffer->size(),
@@ -550,7 +552,7 @@ void decode_page_headers(pass_intermediate_data& pass,
 {
   CUDF_FUNC_RANGE();
 
-  auto iter = thrust::make_counting_iterator(0);
+  auto iter = thrust::counting_iterator<size_t>(0);
   rmm::device_uvector<size_t> chunk_page_counts(pass.chunks.size() + 1, stream);
   thrust::transform_exclusive_scan(
     rmm::exec_policy_nosync(stream),
@@ -562,7 +564,7 @@ void decode_page_headers(pass_intermediate_data& pass,
         return static_cast<size_t>(
           i >= num_chunks ? 0 : chunks[i].num_data_pages + chunks[i].num_dict_pages);
       }),
-    0,
+    size_t{0},
     thrust::plus<size_t>{});
   rmm::device_uvector<chunk_page_info> d_chunk_page_info(pass.chunks.size(), stream);
   thrust::for_each(rmm::exec_policy_nosync(stream),
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index f865c9a7643..77924ac0f35 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include "compact_protocol_reader.hpp"
 #include "compact_protocol_writer.hpp"
 #include "interop/decimal_conversion_utilities.cuh"
+#include "io/comp/gpuinflate.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/parquet/parquet.hpp"
 #include "io/parquet/parquet_gpu.hpp"
@@ -38,10 +39,10 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/linked_column.hpp>
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/lists/detail/dremel.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/memory_resource.hpp>
 
@@ -718,7 +719,7 @@ std::vector<schema_tree_node> construct_parquet_schema_tree(
             // all others
             default:
               CUDF_LOG_WARN(
-                "Unsupported page encoding requested: {}; the requested encoding will be ignored",
+                "Unsupported page encoding requested: %d; the requested encoding will be ignored",
                 static_cast<int>(col_meta.get_encoding()));
               return;
           }
@@ -1302,7 +1303,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     } else {
       chunk.use_dictionary = true;
       chunk.dict_map_size =
-        static_cast<cudf::size_type>(cuco::make_window_extent<map_cg_size, window_size>(
+        static_cast<cudf::size_type>(cuco::make_bucket_extent<map_cg_size, bucket_size>(
           static_cast<cudf::size_type>(occupancy_factor * chunk.num_values)));
       chunk.dict_map_offset = total_map_storage_size;
       total_map_storage_size += chunk.dict_map_size;
@@ -1317,7 +1318,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     total_map_storage_size,
     cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream}};
   // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer.
-  device_span<window_type> const map_storage_data{map_storage.data(), total_map_storage_size};
+  device_span<bucket_type> const map_storage_data{map_storage.data(), total_map_storage_size};
 
   // Synchronize
   chunks.host_to_device_async(stream);
diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp
index 396d44c0763..f15ea1f3c37 100644
--- a/cpp/src/io/parquet/writer_impl_helpers.cpp
+++ b/cpp/src/io/parquet/writer_impl_helpers.cpp
@@ -21,6 +21,8 @@
 
 #include "writer_impl_helpers.hpp"
 
+#include "io/comp/nvcomp_adapter.hpp"
+
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp
index a85411594e9..14a9a0ed5b7 100644
--- a/cpp/src/io/parquet/writer_impl_helpers.hpp
+++ b/cpp/src/io/parquet/writer_impl_helpers.hpp
@@ -20,11 +20,11 @@
  */
 
 #pragma once
-#include "io/comp/nvcomp_adapter.hpp"
 #include "parquet_common.hpp"
 
 #include <cudf/detail/utilities/linked_column.hpp>
 #include <cudf/io/detail/parquet.hpp>
+#include <cudf/io/nvcomp_adapter.hpp>
 
 namespace cudf::io::parquet::detail {
 
@@ -42,7 +42,7 @@ Compression to_parquet_compression(compression_type compression);
  * @param codec Compression codec
  * @return Translated nvcomp compression type
  */
-nvcomp::compression_type to_nvcomp_compression_type(Compression codec);
+cudf::io::detail::nvcomp::compression_type to_nvcomp_compression_type(Compression codec);
 
 /**
  * @brief Function that computes input alignment requirements for the given compression type.
diff --git a/cpp/src/io/statistics/byte_array_view.cuh b/cpp/src/io/statistics/byte_array_view.cuh
index 58698c6a19d..50d823ade88 100644
--- a/cpp/src/io/statistics/byte_array_view.cuh
+++ b/cpp/src/io/statistics/byte_array_view.cuh
@@ -18,6 +18,8 @@
 
 #include <cudf/utilities/span.hpp>
 
+#include <cuda/std/limits>
+
 namespace cudf::io::statistics {
 
 /**
@@ -30,15 +32,19 @@ class byte_array_view {
  public:
   using element_type = std::byte const;  ///< The type of the elements in the byte array
 
-  constexpr byte_array_view() noexcept {}
+  CUDF_HOST_DEVICE constexpr byte_array_view() noexcept {}
   /**
    * @brief Constructs a byte_array_view from a pointer and a size.
    *
    * @param data Pointer to the first element in the byte array.
    * @param size The number of elements in the byte array.
    */
-  constexpr byte_array_view(element_type* data, std::size_t size) : _data(data, size) {}
-  constexpr byte_array_view(byte_array_view const&) noexcept = default;  ///< Copy constructor
+  CUDF_HOST_DEVICE constexpr byte_array_view(element_type* data, std::size_t size)
+    : _data(data, size)
+  {
+  }
+  CUDF_HOST_DEVICE constexpr byte_array_view(byte_array_view const&) noexcept =
+    default;  ///< Copy constructor
   /**
    * @brief Copy assignment operator.
    *
@@ -55,14 +61,20 @@ class byte_array_view {
    * @param idx The index of the element to access.
    * @return A reference to the idx-th element of the byte_array_view, i.e., `_data.data()[idx]`.
    */
-  [[nodiscard]] constexpr element_type& operator[](std::size_t idx) const { return _data[idx]; }
+  [[nodiscard]] __device__ constexpr element_type& operator[](std::size_t idx) const
+  {
+    return _data[idx];
+  }
 
   /**
    * @brief Returns a pointer to the beginning of the byte_array_view.
    *
    * @return A pointer to the first element of the byte_array_view.
    */
-  [[nodiscard]] constexpr element_type* data() const noexcept { return _data.data(); }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr element_type* data() const noexcept
+  {
+    return _data.data();
+  }
 
   /**
    * @brief Returns the number of elements in the byte_array_view.
@@ -76,7 +88,10 @@ class byte_array_view {
    *
    * @return The size of the byte_array_view in bytes
    */
-  [[nodiscard]] constexpr std::size_t size_bytes() const noexcept { return _data.size_bytes(); }
+  [[nodiscard]] CUDF_HOST_DEVICE constexpr std::size_t size_bytes() const noexcept
+  {
+    return _data.size_bytes();
+  }
 
   /**
    * @brief Comparing target byte_array_view with this byte_array_view. Each byte in the array is
@@ -98,9 +113,9 @@ class byte_array_view {
     auto const* ptr2 = rhs.data();
     if ((ptr1 == ptr2) && (len1 == len2)) { return 0; }
     // if I am max, I am greater than the argument
-    if (ptr1 == nullptr && len1 == std::numeric_limits<std::size_t>::max()) { return 1; }
+    if (ptr1 == nullptr && len1 == cuda::std::numeric_limits<std::size_t>::max()) { return 1; }
     // if the argument is max, it is greater than me
-    if (ptr2 == nullptr && len2 == std::numeric_limits<std::size_t>::max()) { return -1; }
+    if (ptr2 == nullptr && len2 == cuda::std::numeric_limits<std::size_t>::max()) { return -1; }
     std::size_t idx = 0;
     for (; (idx < len1) && (idx < len2); ++idx) {
       if (ptr1[idx] != ptr2[idx]) {
@@ -170,7 +185,7 @@ class byte_array_view {
    */
   [[nodiscard]] __device__ inline static byte_array_view max()
   {
-    return {nullptr, std::numeric_limits<std::size_t>::max()};
+    return {nullptr, cuda::std::numeric_limits<std::size_t>::max()};
   }
 
  private:
diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh
index 01db781c766..dc023e69423 100644
--- a/cpp/src/io/statistics/typed_statistics_chunk.cuh
+++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh
@@ -30,6 +30,7 @@
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
+#include <cuda/std/limits>
 #include <math_constants.h>
 #include <thrust/extrema.h>
 
@@ -246,9 +247,9 @@ get_untyped_chunk(typed_statistics_chunk<T, include_aggregate> const& chunk)
     // invalidate the sum if overflow or underflow is possible
     if constexpr (std::is_floating_point_v<E> or std::is_integral_v<E>) {
       if (!chunk.has_minmax) { return true; }
-      return std::numeric_limits<E>::max() / chunk.non_nulls >=
+      return cuda::std::numeric_limits<E>::max() / chunk.non_nulls >=
                static_cast<E>(chunk.maximum_value) and
-             std::numeric_limits<E>::lowest() / chunk.non_nulls <=
+             cuda::std::numeric_limits<E>::lowest() / chunk.non_nulls <=
                static_cast<E>(chunk.minimum_value);
     }
     return true;
diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu
index 06069630685..162da62ef03 100644
--- a/cpp/src/io/text/bgzip_data_chunk_source.cu
+++ b/cpp/src/io/text/bgzip_data_chunk_source.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "io/comp/gpuinflate.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/text/device_data_chunks.hpp"
 
@@ -41,6 +42,8 @@
 namespace cudf::io::text {
 namespace {
 
+namespace nvcomp = cudf::io::detail::nvcomp;
+
 /**
  * @brief Transforms offset tuples of the form [compressed_begin, compressed_end,
  * decompressed_begin, decompressed_end] into span tuples of the form [compressed_device_span,
@@ -73,7 +76,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
   {
     // Buffer needs to be padded.
     // Required by `inflate_kernel`.
-    device.resize(cudf::util::round_up_safe(host.size(), BUFFER_PADDING_MULTIPLE), stream);
+    device.resize(cudf::util::round_up_safe(host.size(), cudf::io::detail::BUFFER_PADDING_MULTIPLE),
+                  stream);
     cudf::detail::cuda_memcpy_async<T>(
       device_span<T>{device}.subspan(0, host.size()), host, stream);
   }
@@ -94,7 +98,7 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
     rmm::device_uvector<std::size_t> d_decompressed_offsets;
     rmm::device_uvector<device_span<uint8_t const>> d_compressed_spans;
     rmm::device_uvector<device_span<uint8_t>> d_decompressed_spans;
-    rmm::device_uvector<compression_result> d_decompression_results;
+    rmm::device_uvector<cudf::io::detail::compression_result> d_decompression_results;
     std::size_t compressed_size_with_headers{};
     std::size_t max_decompressed_size{};
     // this is usually equal to decompressed_size()
@@ -152,16 +156,16 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
           gpuinflate(d_compressed_spans,
                      d_decompressed_spans,
                      d_decompression_results,
-                     gzip_header_included::NO,
+                     cudf::io::detail::gzip_header_included::NO,
                      stream);
         } else {
-          cudf::io::nvcomp::batched_decompress(cudf::io::nvcomp::compression_type::DEFLATE,
-                                               d_compressed_spans,
-                                               d_decompressed_spans,
-                                               d_decompression_results,
-                                               max_decompressed_size,
-                                               decompressed_size(),
-                                               stream);
+          nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE,
+                                     d_compressed_spans,
+                                     d_decompressed_spans,
+                                     d_decompression_results,
+                                     max_decompressed_size,
+                                     decompressed_size(),
+                                     stream);
         }
       }
       is_decompressed = true;
diff --git a/cpp/src/io/utilities/base64_utilities.cpp b/cpp/src/io/utilities/base64_utilities.cpp
index 2a2a07afc8d..00fc54f9883 100644
--- a/cpp/src/io/utilities/base64_utilities.cpp
+++ b/cpp/src/io/utilities/base64_utilities.cpp
@@ -60,7 +60,7 @@
 
 #include "base64_utilities.hpp"
 
-#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/logger.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp
index bed03869b34..975206646c6 100644
--- a/cpp/src/io/utilities/data_sink.cpp
+++ b/cpp/src/io/utilities/data_sink.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,9 @@
 
 #include "file_io_utilities.hpp"
 
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/io/config_utils.hpp>
 #include <cudf/io/data_sink.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <kvikio/file_handle.hpp>
@@ -44,7 +44,7 @@ class file_sink : public data_sink {
     if (cufile_integration::is_kvikio_enabled()) {
       cufile_integration::set_up_kvikio();
       _kvikio_file = kvikio::FileHandle(filepath, "w");
-      CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.",
+      CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode %s.",
                     _kvikio_file.is_compat_mode_preferred() ? "on" : "off");
     } else {
       _cufile_out = detail::make_cufile_output(filepath);
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 62ef7c7a794..87b3c6facdf 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,11 +17,11 @@
 #include "file_io_utilities.hpp"
 #include "getenv_or.hpp"
 
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/config_utils.hpp>
 #include <cudf/io/datasource.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
 
@@ -55,7 +55,7 @@ class file_source : public datasource {
     if (cufile_integration::is_kvikio_enabled()) {
       cufile_integration::set_up_kvikio();
       _kvikio_file = kvikio::FileHandle(filepath);
-      CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.",
+      CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode %s.",
                     _kvikio_file.is_compat_mode_preferred() ? "on" : "off");
     } else {
       _cufile_in = detail::make_cufile_input(filepath);
@@ -230,7 +230,7 @@ class memory_mapped_source : public file_source {
   {
     if (_map_addr != nullptr) {
       auto const result = munmap(_map_addr, _map_size);
-      if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); }
+      if (result != 0) { CUDF_LOG_WARN("munmap failed with %d", result); }
       _map_addr = nullptr;
     }
   }
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index 9b17e7f6d55..28367c95430 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -19,10 +19,11 @@
 #include "getenv_or.hpp"
 
 #include <cudf/detail/utilities/integer_utils.hpp>
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/io/config_utils.hpp>
+#include <cudf/logger.hpp>
 
 #include <dlfcn.h>
+#include <sys/stat.h>
 
 #include <cerrno>
 #include <cstring>
diff --git a/cpp/src/io/utilities/getenv_or.hpp b/cpp/src/io/utilities/getenv_or.hpp
index 3fd97a00b61..acfd2221797 100644
--- a/cpp/src/io/utilities/getenv_or.hpp
+++ b/cpp/src/io/utilities/getenv_or.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/logger.hpp>
 
 #include <cstdlib>
 #include <sstream>
@@ -32,10 +32,17 @@ T getenv_or(std::string_view env_var_name, T default_val)
 {
   auto const env_val = std::getenv(env_var_name.data());
   if (env_val != nullptr) {
-    CUDF_LOG_INFO("Environment variable {} read as {}", env_var_name, env_val);
+    CUDF_LOG_INFO("Environment variable %.*s read as %s",
+                  static_cast<int>(env_var_name.length()),
+                  env_var_name.data(),
+                  env_val);
   } else {
-    CUDF_LOG_INFO(
-      "Environment variable {} is not set, using default value {}", env_var_name, default_val);
+    std::stringstream ss;
+    ss << default_val;  // stringify via operator<< so any streamable T can be logged
+    CUDF_LOG_INFO("Environment variable %.*s is not set, using default value %s",
+                  static_cast<int>(env_var_name.length()),
+                  env_var_name.data(),
+                  ss.str().c_str());  // printf-style %s expects a C string, not std::string
   }
 
   if (env_val == nullptr) { return default_val; }
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 734067582f7..75e45a68842 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -30,12 +30,11 @@
 
 #include <rmm/device_uvector.hpp>
 
+#include <cuda/std/optional>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/mismatch.h>
 
-#include <optional>
-
 using cudf::device_span;
 
 namespace cudf {
@@ -183,7 +182,7 @@ constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' -
  * @param end Pointer to the first element after the string
  * @return true if string is valid infinity, else false.
  */
-constexpr bool is_infinity(char const* begin, char const* end)
+CUDF_HOST_DEVICE constexpr bool is_infinity(char const* begin, char const* end)
 {
   if (*begin == '-' || *begin == '+') begin++;
   char const* cinf = "infinity";
@@ -208,9 +207,9 @@ constexpr bool is_infinity(char const* begin, char const* end)
  * @return The parsed and converted value
  */
 template <typename T, int base = 10>
-__host__ __device__ std::optional<T> parse_numeric(char const* begin,
-                                                   char const* end,
-                                                   parse_options_view const& opts)
+__host__ __device__ cuda::std::optional<T> parse_numeric(char const* begin,
+                                                         char const* end,
+                                                         parse_options_view const& opts)
 {
   T value{};
   bool all_digits_valid = true;
@@ -267,7 +266,7 @@ __host__ __device__ std::optional<T> parse_numeric(char const* begin,
       if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
     }
   }
-  if (!all_digits_valid) { return std::optional<T>{}; }
+  if (!all_digits_valid) { return cuda::std::optional<T>{}; }
 
   return value * sign;
 }
@@ -524,7 +523,7 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex = false)
   {
-    auto const value = [as_hex, &opts, begin, end]() -> std::optional<T> {
+    auto const value = [as_hex, &opts, begin, end]() -> cuda::std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
@@ -573,7 +572,7 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    auto const value = [&opts, begin, end]() -> std::optional<T> {
+    auto const value = [&opts, begin, end]() -> cuda::std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) {
@@ -602,7 +601,7 @@ struct ConvertFunctor {
                                                       parse_options_view const& opts,
                                                       bool as_hex)
   {
-    auto const value = [&opts, begin, end]() -> std::optional<T> {
+    auto const value = [&opts, begin, end]() -> cuda::std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) {
diff --git a/cpp/src/io/utilities/time_utils.cuh b/cpp/src/io/utilities/time_utils.cuh
index 687766c1bcc..ff1b9f58e6c 100644
--- a/cpp/src/io/utilities/time_utils.cuh
+++ b/cpp/src/io/utilities/time_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@ static const __device__ __constant__ int32_t powers_of_ten[10] = {
 
 struct get_period {
   template <typename T>
-  constexpr int32_t operator()()
+  int32_t operator()()
   {
     if constexpr (is_chrono<T>()) { return T::period::den; }
     CUDF_FAIL("Invalid, non chrono type");
@@ -42,7 +42,7 @@ struct get_period {
 /**
  * @brief Function that translates cuDF time unit to clock frequency
  */
-constexpr int32_t to_clockrate(type_id timestamp_type_id)
+inline int32_t to_clockrate(type_id timestamp_type_id)
 {
   return timestamp_type_id == type_id::EMPTY
            ? 0
diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh
index caea8dabb88..c0efc5b6f20 100644
--- a/cpp/src/io/utilities/trie.cuh
+++ b/cpp/src/io/utilities/trie.cuh
@@ -82,8 +82,8 @@ CUDF_EXPORT trie create_serialized_trie(std::vector<std::string> const& keys,
  *
  * @return Boolean value; true if string is found, false otherwise
  */
-__host__ __device__ inline bool serialized_trie_contains(device_span<serial_trie_node const> trie,
-                                                         device_span<char const> key)
+CUDF_HOST_DEVICE inline bool serialized_trie_contains(device_span<serial_trie_node const> trie,
+                                                      device_span<char const> key)
 {
   if (trie.empty()) { return false; }
   if (key.empty()) { return trie.front().is_leaf; }
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index ce4d2067b82..d1a01ee76e4 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -47,28 +47,19 @@ namespace cudf {
 namespace detail {
 namespace {
 
-template <cudf::has_nested HasNested>
-auto prepare_device_equal(
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> build,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> probe,
-  bool has_nulls,
-  cudf::null_equality compare_nulls)
-{
-  auto const two_table_equal =
-    cudf::experimental::row::equality::two_table_comparator(probe, build);
-  return comparator_adapter{two_table_equal.equal_to<HasNested == cudf::has_nested::YES>(
-    nullate::DYNAMIC{has_nulls}, compare_nulls)};
-}
+bool constexpr has_nulls = true;  ///< Always has nulls
 
 /**
  * @brief Device functor to create a pair of {hash_value, row_index} for a given row.
- *
- * @tparam Hasher The type of internal hasher to compute row hash.
  */
-template <typename Hasher, typename T>
+template <typename T>
 class build_keys_fn {
+  using hasher =
+    cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
+                                                     cudf::nullate::DYNAMIC>;
+
  public:
-  CUDF_HOST_DEVICE build_keys_fn(Hasher const& hash) : _hash{hash} {}
+  CUDF_HOST_DEVICE constexpr build_keys_fn(hasher const& hash) : _hash{hash} {}
 
   __device__ __forceinline__ auto operator()(size_type i) const noexcept
   {
@@ -76,7 +67,7 @@ class build_keys_fn {
   }
 
  private:
-  Hasher _hash;
+  hasher _hash;
 };
 
 /**
@@ -92,26 +83,19 @@ struct output_fn {
 };
 }  // namespace
 
-template <cudf::has_nested HasNested>
-distinct_hash_join<HasNested>::distinct_hash_join(cudf::table_view const& build,
-                                                  cudf::table_view const& probe,
-                                                  bool has_nulls,
-                                                  cudf::null_equality compare_nulls,
-                                                  rmm::cuda_stream_view stream)
-  : _has_nulls{has_nulls},
+distinct_hash_join::distinct_hash_join(cudf::table_view const& build,
+                                       cudf::null_equality compare_nulls,
+                                       rmm::cuda_stream_view stream)
+  : _has_nested_columns{cudf::has_nested_columns(build)},
     _nulls_equal{compare_nulls},
     _build{build},
-    _probe{probe},
     _preprocessed_build{
       cudf::experimental::row::equality::preprocessed_table::create(_build, stream)},
-    _preprocessed_probe{
-      cudf::experimental::row::equality::preprocessed_table::create(_probe, stream)},
     _hash_table{build.num_rows(),
                 CUCO_DESIRED_LOAD_FACTOR,
                 cuco::empty_key{cuco::pair{std::numeric_limits<hash_value_type>::max(),
                                            rhs_index_type{JoinNoneValue}}},
-                prepare_device_equal<HasNested>(
-                  _preprocessed_build, _preprocessed_probe, has_nulls, compare_nulls),
+                always_not_equal{},
                 {},
                 cuco::thread_scope_device,
                 cuco_storage_type{},
@@ -124,10 +108,10 @@ distinct_hash_join<HasNested>::distinct_hash_join(cudf::table_view const& build,
   if (this->_build.num_rows() == 0) { return; }
 
   auto const row_hasher = experimental::row::hash::row_hasher{this->_preprocessed_build};
-  auto const d_hasher   = row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls});
+  auto const d_hasher   = row_hasher.device_hasher(nullate::DYNAMIC{has_nulls});
 
-  auto const iter = cudf::detail::make_counting_transform_iterator(
-    0, build_keys_fn<decltype(d_hasher), rhs_index_type>{d_hasher});
+  auto const iter =
+    cudf::detail::make_counting_transform_iterator(0, build_keys_fn<rhs_index_type>{d_hasher});
 
   size_type const build_table_num_rows{build.num_rows()};
   if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(this->_build))) {
@@ -146,15 +130,15 @@ distinct_hash_join<HasNested>::distinct_hash_join(cudf::table_view const& build,
   }
 }
 
-template <cudf::has_nested HasNested>
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
-                                          rmm::device_async_resource_ref mr) const
+distinct_hash_join::inner_join(cudf::table_view const& probe,
+                               rmm::cuda_stream_view stream,
+                               rmm::device_async_resource_ref mr) const
 {
   cudf::scoped_range range{"distinct_hash_join::inner_join"};
 
-  size_type const probe_table_num_rows{this->_probe.num_rows()};
+  size_type const probe_table_num_rows{probe.num_rows()};
 
   // If output size is zero, return immediately
   if (probe_table_num_rows == 0) {
@@ -162,25 +146,62 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
                      std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
+  auto preprocessed_probe =
+    cudf::experimental::row::equality::preprocessed_table::create(probe, stream);
+  auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator(
+    preprocessed_probe, _preprocessed_build);
+
   auto build_indices =
     std::make_unique<rmm::device_uvector<size_type>>(probe_table_num_rows, stream, mr);
   auto probe_indices =
     std::make_unique<rmm::device_uvector<size_type>>(probe_table_num_rows, stream, mr);
 
-  auto const probe_row_hasher =
-    cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe};
-  auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls});
-  auto const iter           = cudf::detail::make_counting_transform_iterator(
-    0, build_keys_fn<decltype(d_probe_hasher), lhs_index_type>{d_probe_hasher});
+  auto const probe_row_hasher = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
+  auto const d_probe_hasher   = probe_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls});
+  auto const iter             = cudf::detail::make_counting_transform_iterator(
+    0, build_keys_fn<lhs_index_type>{d_probe_hasher});
 
   auto found_indices = rmm::device_uvector<size_type>(probe_table_num_rows, stream);
   auto const found_begin =
     thrust::make_transform_output_iterator(found_indices.begin(), output_fn{});
 
-  // TODO conditional find for nulls once `cuco::static_set::find_if` is added
-  // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not equal
-  // to `JoinNoneValue`, then `idx` has a match in the hash set.
-  this->_hash_table.find_async(iter, iter + probe_table_num_rows, found_begin, stream.value());
+  auto const comparator_helper = [&](auto device_comparator) {
+    // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not
+    // equal to `JoinNoneValue`, then `idx` has a match in the hash set.
+    if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(probe))) {
+      this->_hash_table.find_async(iter,
+                                   iter + probe_table_num_rows,
+                                   comparator_adapter{device_comparator},
+                                   hasher{},
+                                   found_begin,
+                                   stream.value());
+    } else {
+      auto stencil = thrust::counting_iterator<size_type>{0};
+      auto const row_bitmask =
+        cudf::detail::bitmask_and(probe, stream, cudf::get_current_device_resource_ref()).first;
+      auto const pred =
+        cudf::detail::row_is_valid{reinterpret_cast<bitmask_type const*>(row_bitmask.data())};
+
+      this->_hash_table.find_if_async(iter,
+                                      iter + probe_table_num_rows,
+                                      stencil,
+                                      pred,
+                                      comparator_adapter{device_comparator},
+                                      hasher{},
+                                      found_begin,
+                                      stream.value());
+    }
+  };
+
+  if (_has_nested_columns) {
+    auto const device_comparator =
+      two_table_equal.equal_to<true>(nullate::DYNAMIC{has_nulls}, _nulls_equal);
+    comparator_helper(device_comparator);
+  } else {
+    auto const device_comparator =
+      two_table_equal.equal_to<false>(nullate::DYNAMIC{has_nulls}, _nulls_equal);
+    comparator_helper(device_comparator);
+  }
 
   auto const tuple_iter = cudf::detail::make_counting_transform_iterator(
     0,
@@ -203,16 +224,17 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
   build_indices->resize(actual_size, stream);
   probe_indices->resize(actual_size, stream);
 
-  return {std::move(build_indices), std::move(probe_indices)};
+  return {std::move(probe_indices), std::move(build_indices)};
 }
 
-template <cudf::has_nested HasNested>
-std::unique_ptr<rmm::device_uvector<size_type>> distinct_hash_join<HasNested>::left_join(
-  rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
+std::unique_ptr<rmm::device_uvector<size_type>> distinct_hash_join::left_join(
+  cudf::table_view const& probe,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr) const
 {
   cudf::scoped_range range{"distinct_hash_join::left_join"};
 
-  size_type const probe_table_num_rows{this->_probe.num_rows()};
+  size_type const probe_table_num_rows{probe.num_rows()};
 
   // If output size is zero, return empty
   if (probe_table_num_rows == 0) {
@@ -227,80 +249,82 @@ std::unique_ptr<rmm::device_uvector<size_type>> distinct_hash_join<HasNested>::l
     thrust::fill(
       rmm::exec_policy_nosync(stream), build_indices->begin(), build_indices->end(), JoinNoneValue);
   } else {
-    auto const probe_row_hasher =
-      cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe};
-    auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls});
-    auto const iter           = cudf::detail::make_counting_transform_iterator(
-      0, build_keys_fn<decltype(d_probe_hasher), lhs_index_type>{d_probe_hasher});
+    auto preprocessed_probe =
+      cudf::experimental::row::equality::preprocessed_table::create(probe, stream);
+    auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator(
+      preprocessed_probe, _preprocessed_build);
+
+    auto const probe_row_hasher = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
+    auto const d_probe_hasher   = probe_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls});
+    auto const iter             = cudf::detail::make_counting_transform_iterator(
+      0, build_keys_fn<lhs_index_type>{d_probe_hasher});
 
     auto const output_begin =
       thrust::make_transform_output_iterator(build_indices->begin(), output_fn{});
-    // TODO conditional find for nulls once `cuco::static_set::find_if` is added
-    this->_hash_table.find_async(iter, iter + probe_table_num_rows, output_begin, stream.value());
+    auto const comparator_helper = [&](auto device_comparator) {
+      if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(probe))) {
+        this->_hash_table.find_async(iter,
+                                     iter + probe_table_num_rows,
+                                     comparator_adapter{device_comparator},
+                                     hasher{},
+                                     output_begin,
+                                     stream.value());
+      } else {
+        auto stencil = thrust::counting_iterator<size_type>{0};
+        auto const row_bitmask =
+          cudf::detail::bitmask_and(probe, stream, cudf::get_current_device_resource_ref()).first;
+        auto const pred =
+          cudf::detail::row_is_valid{reinterpret_cast<bitmask_type const*>(row_bitmask.data())};
+
+        this->_hash_table.find_if_async(iter,
+                                        iter + probe_table_num_rows,
+                                        stencil,
+                                        pred,
+                                        comparator_adapter{device_comparator},
+                                        hasher{},
+                                        output_begin,
+                                        stream.value());
+      }
+    };
+
+    if (_has_nested_columns) {
+      auto const device_comparator =
+        two_table_equal.equal_to<true>(nullate::DYNAMIC{has_nulls}, _nulls_equal);
+      comparator_helper(device_comparator);
+    } else {
+      auto const device_comparator =
+        two_table_equal.equal_to<false>(nullate::DYNAMIC{has_nulls}, _nulls_equal);
+      comparator_helper(device_comparator);
+    }
   }
 
   return build_indices;
 }
 }  // namespace detail
 
-template <>
-distinct_hash_join<cudf::has_nested::YES>::~distinct_hash_join() = default;
-
-template <>
-distinct_hash_join<cudf::has_nested::NO>::~distinct_hash_join() = default;
-
-template <>
-distinct_hash_join<cudf::has_nested::YES>::distinct_hash_join(cudf::table_view const& build,
-                                                              cudf::table_view const& probe,
-                                                              nullable_join has_nulls,
-                                                              null_equality compare_nulls,
-                                                              rmm::cuda_stream_view stream)
-  : _impl{std::make_unique<impl_type>(
-      build, probe, has_nulls == nullable_join::YES, compare_nulls, stream)}
-{
-}
-
-template <>
-distinct_hash_join<cudf::has_nested::NO>::distinct_hash_join(cudf::table_view const& build,
-                                                             cudf::table_view const& probe,
-                                                             nullable_join has_nulls,
-                                                             null_equality compare_nulls,
-                                                             rmm::cuda_stream_view stream)
-  : _impl{std::make_unique<impl_type>(
-      build, probe, has_nulls == nullable_join::YES, compare_nulls, stream)}
-{
-}
+distinct_hash_join::~distinct_hash_join() = default;
 
-template <>
-std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
-          std::unique_ptr<rmm::device_uvector<size_type>>>
-distinct_hash_join<cudf::has_nested::YES>::inner_join(rmm::cuda_stream_view stream,
-                                                      rmm::device_async_resource_ref mr) const
+distinct_hash_join::distinct_hash_join(cudf::table_view const& build,
+                                       null_equality compare_nulls,
+                                       rmm::cuda_stream_view stream)
+  : _impl{std::make_unique<impl_type>(build, compare_nulls, stream)}
 {
-  return _impl->inner_join(stream, mr);
 }
 
-template <>
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-distinct_hash_join<cudf::has_nested::NO>::inner_join(rmm::cuda_stream_view stream,
-                                                     rmm::device_async_resource_ref mr) const
-{
-  return _impl->inner_join(stream, mr);
-}
-
-template <>
-std::unique_ptr<rmm::device_uvector<size_type>>
-distinct_hash_join<cudf::has_nested::YES>::left_join(rmm::cuda_stream_view stream,
-                                                     rmm::device_async_resource_ref mr) const
+distinct_hash_join::inner_join(cudf::table_view const& probe,
+                               rmm::cuda_stream_view stream,
+                               rmm::device_async_resource_ref mr) const
 {
-  return _impl->left_join(stream, mr);
+  return _impl->inner_join(probe, stream, mr);
 }
 
-template <>
-std::unique_ptr<rmm::device_uvector<size_type>> distinct_hash_join<cudf::has_nested::NO>::left_join(
-  rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
+std::unique_ptr<rmm::device_uvector<size_type>> distinct_hash_join::left_join(
+  cudf::table_view const& probe,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr) const
 {
-  return _impl->left_join(stream, mr);
+  return _impl->left_join(probe, stream, mr);
 }
 }  // namespace cudf
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 4f75908fe72..37c5698f654 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/utilities/memory_resource.hpp>
 
diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh
index 368b1fba870..4565626edad 100644
--- a/cpp/src/join/mixed_join_kernel.cuh
+++ b/cpp/src/join/mixed_join_kernel.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,8 +37,6 @@ namespace detail {
 
 namespace cg = cooperative_groups;
 
-#pragma GCC diagnostic ignored "-Wattributes"
-
 template <cudf::size_type block_size, bool has_nulls>
 CUDF_KERNEL void __launch_bounds__(block_size)
   mixed_join(table_device_view left_table,
diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu
index a4ec97af235..4c063b6202e 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cu
+++ b/cpp/src/join/mixed_join_kernels_semi.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,8 +30,6 @@ namespace detail {
 
 namespace cg = cooperative_groups;
 
-#pragma GCC diagnostic ignored "-Wattributes"
-
 template <cudf::size_type block_size, bool has_nulls>
 CUDF_KERNEL void __launch_bounds__(block_size)
   mixed_join_semi(table_device_view left_table,
diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh
index 4049ccf35e1..869d05ce4d3 100644
--- a/cpp/src/join/mixed_join_size_kernel.cuh
+++ b/cpp/src/join/mixed_join_size_kernel.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,8 +34,6 @@ namespace cudf {
 namespace detail {
 namespace cg = cooperative_groups;
 
-#pragma GCC diagnostic ignored "-Wattributes"
-
 template <int block_size, bool has_nulls>
 CUDF_KERNEL void __launch_bounds__(block_size)
   compute_mixed_join_output_size(table_device_view left_table,
@@ -62,8 +60,8 @@ CUDF_KERNEL void __launch_bounds__(block_size)
     intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates);
 
   std::size_t thread_counter{0};
-  cudf::size_type const start_idx      = threadIdx.x + blockIdx.x * block_size;
-  cudf::size_type const stride         = block_size * gridDim.x;
+  auto const start_idx                 = cudf::detail::grid_1d::global_thread_id();
+  auto const stride                    = cudf::detail::grid_1d::grid_stride();
   cudf::size_type const left_num_rows  = left_table.num_rows();
   cudf::size_type const right_num_rows = right_table.num_rows();
   auto const outer_num_rows            = (swap_tables ? right_num_rows : left_num_rows);
@@ -80,7 +78,7 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   auto count_equality = pair_expression_equality<has_nulls>{
     evaluator, thread_intermediate_storage, swap_tables, equality_probe};
 
-  for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows;
+  for (auto outer_row_index = start_idx; outer_row_index < outer_num_rows;
        outer_row_index += stride) {
     auto query_pair = pair_func(outer_row_index);
     if (join_type == join_kind::LEFT_JOIN || join_type == join_kind::FULL_JOIN) {
diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu
index fd8629ed6f3..e6e01b9c9fe 100644
--- a/cpp/src/json/json_path.cu
+++ b/cpp/src/json/json_path.cu
@@ -928,7 +928,7 @@ __launch_bounds__(block_size) CUDF_KERNEL
                               get_json_object_options options)
 {
   auto tid          = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::thread_index_type{blockDim.x} * cudf::thread_index_type{gridDim.x};
+  auto const stride = cudf::detail::grid_1d::grid_stride();
 
   size_type warp_valid_count{0};
 
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index ebab3beb08f..d6b85db3f0f 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -138,7 +138,7 @@ CUDF_KERNEL void compute_row_partition_numbers(row_hasher_t the_hasher,
   auto const stride = cudf::detail::grid_1d::grid_stride();
 
   // Initialize local histogram
-  size_type partition_number = threadIdx.x;
+  thread_index_type partition_number = threadIdx.x;
   while (partition_number < num_partitions) {
     shared_partition_sizes[partition_number] = 0;
     partition_number += blockDim.x;
@@ -207,7 +207,7 @@ CUDF_KERNEL void compute_row_output_locations(size_type* __restrict__ row_partit
   extern __shared__ size_type shared_partition_offsets[];
 
   // Initialize array of this blocks offsets from global array
-  size_type partition_number = threadIdx.x;
+  thread_index_type partition_number = threadIdx.x;
   while (partition_number < num_partitions) {
     shared_partition_offsets[partition_number] =
       block_partition_offsets[partition_number * gridDim.x + blockIdx.x];
@@ -303,7 +303,8 @@ CUDF_KERNEL void copy_block_partitions(InputIter input_iter,
 
   // Fetch the offset in the output buffer of each partition in this thread
   // block
-  for (size_type ipartition = threadIdx.x; ipartition < num_partitions; ipartition += blockDim.x) {
+  for (thread_index_type ipartition = threadIdx.x; ipartition < num_partitions;
+       ipartition += blockDim.x) {
     partition_offset_global[ipartition] =
       scanned_block_partition_sizes[ipartition * gridDim.x + blockIdx.x];
   }
diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp
index 47864c25c5f..a60cbbb8db2 100644
--- a/cpp/src/quantiles/quantiles_util.hpp
+++ b/cpp/src/quantiles/quantiles_util.hpp
@@ -20,7 +20,8 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <cmath>
+#include <cuda/std/cmath>
+#include <cuda/std/functional>
 
 namespace cudf {
 namespace detail {
@@ -96,12 +97,12 @@ struct quantile_index {
 
   CUDF_HOST_DEVICE inline quantile_index(size_type count, double quantile)
   {
-    quantile = std::min(std::max(quantile, 0.0), 1.0);
+    quantile = cuda::std::min(cuda::std::max(quantile, 0.0), 1.0);
 
     double val = quantile * (count - 1);
     lower      = std::floor(val);
-    higher     = static_cast<size_type>(std::ceil(val));
-    nearest    = static_cast<size_type>(std::nearbyint(val));
+    higher     = static_cast<size_type>(cuda::std::ceil(val));
+    nearest    = static_cast<size_type>(cuda::std::nearbyint(val));
     fraction   = val - lower;
   }
 };
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index d27420658d6..2128bacff80 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -385,7 +385,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
                                                 size_type const* group_cluster_offsets,
                                                 bool has_nulls)
 {
-  int const tid = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
 
   auto const group_index = tid;
   if (group_index >= num_groups) { return; }
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 98fd9f679c8..21d8c95e199 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -218,9 +218,8 @@ struct minmax_functor {
     auto dev_result = reduce<cudf::string_view>(col, stream);
     // copy the minmax_pair to the host; does not copy the strings
     using OutputType = minmax_pair<cudf::string_view>;
-    OutputType host_result;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDefault, stream.value()));
+
+    auto const host_result = dev_result.value(stream);
     // strings are copied to create the scalars here
     return {std::make_unique<string_scalar>(host_result.min_val, true, stream, mr),
             std::make_unique<string_scalar>(host_result.max_val, true, stream, mr)};
@@ -236,10 +235,8 @@ struct minmax_functor {
     // compute minimum and maximum values
     auto dev_result = reduce<T>(col, stream);
     // copy the minmax_pair to the host to call get_element
-    using OutputType = minmax_pair<T>;
-    OutputType host_result;
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      &host_result, dev_result.data(), sizeof(OutputType), cudaMemcpyDefault, stream.value()));
+    using OutputType       = minmax_pair<T>;
+    OutputType host_result = dev_result.value(stream);
     // get the keys for those indexes
     auto const keys = dictionary_column_view(col).keys();
     return {detail::get_element(keys, static_cast<size_type>(host_result.min_val), stream, mr),
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 4ec2174a96f..4b0b08fe251 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -114,11 +114,10 @@ string_scalar::operator std::string() const { return this->to_string(cudf::get_d
 
 std::string string_scalar::to_string(rmm::cuda_stream_view stream) const
 {
-  std::string result;
-  result.resize(_data.size());
-  CUDF_CUDA_TRY(
-    cudaMemcpyAsync(&result[0], _data.data(), _data.size(), cudaMemcpyDefault, stream.value()));
-  stream.synchronize();
+  std::string result(size(), '\0');
+  detail::cuda_memcpy(host_span<char>{result.data(), result.size()},
+                      device_span<char const>{data(), _data.size()},
+                      stream);
   return result;
 }
 
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index 7d11b02d3e1..9ab8ed5938a 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -95,8 +95,8 @@ rmm::device_uvector<size_type> distinct_indices(table_view const& input,
   auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input);
 
   auto const helper_func = [&](auto const& d_equal) {
-    using RowHasher = std::decay_t<decltype(d_equal)>;
-    auto set        = hash_set_type<RowHasher>{
+    using RowEqual = std::decay_t<decltype(d_equal)>;
+    auto set       = distinct_set_t<RowEqual>{
       num_rows,
       0.5,  // desired load factor
       cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu
index c3a004b7f28..aadb438b019 100644
--- a/cpp/src/stream_compaction/distinct_helpers.cu
+++ b/cpp/src/stream_compaction/distinct_helpers.cu
@@ -21,8 +21,8 @@
 
 namespace cudf::detail {
 
-template <typename RowHasher>
-rmm::device_uvector<size_type> reduce_by_row(hash_set_type<RowHasher>& set,
+template <typename RowEqual>
+rmm::device_uvector<size_type> reduce_by_row(distinct_set_t<RowEqual>& set,
                                              size_type num_rows,
                                              duplicate_keep_option keep,
                                              rmm::cuda_stream_view stream,
@@ -100,7 +100,7 @@ rmm::device_uvector<size_type> reduce_by_row(hash_set_type<RowHasher>& set,
 }
 
 template rmm::device_uvector<size_type> reduce_by_row(
-  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+  distinct_set_t<cudf::experimental::row::equality::device_row_comparator<
     false,
     cudf::nullate::DYNAMIC,
     cudf::experimental::row::equality::nan_equal_physical_equality_comparator>>& set,
@@ -110,7 +110,7 @@ template rmm::device_uvector<size_type> reduce_by_row(
   rmm::device_async_resource_ref mr);
 
 template rmm::device_uvector<size_type> reduce_by_row(
-  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+  distinct_set_t<cudf::experimental::row::equality::device_row_comparator<
     true,
     cudf::nullate::DYNAMIC,
     cudf::experimental::row::equality::nan_equal_physical_equality_comparator>>& set,
@@ -120,7 +120,7 @@ template rmm::device_uvector<size_type> reduce_by_row(
   rmm::device_async_resource_ref mr);
 
 template rmm::device_uvector<size_type> reduce_by_row(
-  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+  distinct_set_t<cudf::experimental::row::equality::device_row_comparator<
     false,
     cudf::nullate::DYNAMIC,
     cudf::experimental::row::equality::physical_equality_comparator>>& set,
@@ -130,7 +130,7 @@ template rmm::device_uvector<size_type> reduce_by_row(
   rmm::device_async_resource_ref mr);
 
 template rmm::device_uvector<size_type> reduce_by_row(
-  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+  distinct_set_t<cudf::experimental::row::equality::device_row_comparator<
     true,
     cudf::nullate::DYNAMIC,
     cudf::experimental::row::equality::physical_equality_comparator>>& set,
diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp
index f15807c2434..4ca1cab937a 100644
--- a/cpp/src/stream_compaction/distinct_helpers.hpp
+++ b/cpp/src/stream_compaction/distinct_helpers.hpp
@@ -47,12 +47,12 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
   }
 }
 
-template <typename RowHasher>
-using hash_set_type =
+template <typename RowEqual>
+using distinct_set_t =
   cuco::static_set<size_type,
                    cuco::extent<int64_t>,
                    cuda::thread_scope_device,
-                   RowHasher,
+                   RowEqual,
                    cuco::linear_probing<1,
                                         cudf::experimental::row::hash::device_row_hasher<
                                           cudf::hashing::detail::default_hash,
@@ -79,6 +79,8 @@ using hash_set_type =
  * the `reduction_init_value()` function. Then, the reduction result for each row group is written
  * into the output array at the index of an unspecified row in the group.
  *
+ * @tparam RowEqual The type of row equality comparator
+ *
  * @param set The auxiliary set to perform reduction
  * @param set_size The number of elements in set
  * @param num_rows The number of all input rows
@@ -87,8 +89,8 @@ using hash_set_type =
  * @param mr Device memory resource used to allocate the returned vector
  * @return A device_uvector containing the output indices
  */
-template <typename RowHasher>
-rmm::device_uvector<size_type> reduce_by_row(hash_set_type<RowHasher>& set,
+template <typename RowEqual>
+rmm::device_uvector<size_type> reduce_by_row(distinct_set_t<RowEqual>& set,
                                              size_type num_rows,
                                              duplicate_keep_option keep,
                                              rmm::cuda_stream_view stream,
diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh
index 2df404048f7..d22fb04696c 100644
--- a/cpp/src/strings/regex/regex.cuh
+++ b/cpp/src/strings/regex/regex.cuh
@@ -186,7 +186,7 @@ class reprog_device {
    *            Specify -1 to match any virtual positions past the end of the string.
    * @return If match found, returns character positions of the matches.
    */
-  __device__ [[nodiscard]] inline match_result find(int32_t const thread_idx,
+  [[nodiscard]] __device__ inline match_result find(int32_t const thread_idx,
                                                     string_view const d_str,
                                                     string_view::const_iterator begin,
                                                     cudf::size_type end = -1) const;
@@ -205,7 +205,7 @@ class reprog_device {
    * @param group_id The specific group to return its matching position values.
    * @return If valid, returns the character position of the matched group in the given string,
    */
-  __device__ [[nodiscard]] inline match_result extract(int32_t const thread_idx,
+  [[nodiscard]] __device__ inline match_result extract(int32_t const thread_idx,
                                                        string_view const d_str,
                                                        string_view::const_iterator begin,
                                                        cudf::size_type end,
@@ -225,17 +225,17 @@ class reprog_device {
   /**
    * @brief Returns the regex instruction object for a given id.
    */
-  __device__ [[nodiscard]] inline reinst get_inst(int32_t id) const;
+  [[nodiscard]] __device__ inline reinst get_inst(int32_t id) const;
 
   /**
    * @brief Returns the regex class object for a given id.
    */
-  __device__ [[nodiscard]] inline reclass_device get_class(int32_t id) const;
+  [[nodiscard]] __device__ inline reclass_device get_class(int32_t id) const;
 
   /**
    * @brief Executes the regex pattern on the given string.
    */
-  __device__ [[nodiscard]] inline match_result regexec(string_view const d_str,
+  [[nodiscard]] __device__ inline match_result regexec(string_view const d_str,
                                                        reljunk jnk,
                                                        string_view::const_iterator begin,
                                                        cudf::size_type end,
@@ -244,7 +244,7 @@ class reprog_device {
   /**
    * @brief Utility wrapper to setup state memory structures for calling regexec
    */
-  __device__ [[nodiscard]] inline match_result call_regexec(
+  [[nodiscard]] __device__ inline match_result call_regexec(
     int32_t const thread_idx,
     string_view const d_str,
     string_view::const_iterator begin,
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index e34a1e12015..906f09e4d82 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -81,11 +81,11 @@ struct alignas(8) relist {
     return true;
   }
 
-  __device__ [[nodiscard]] __forceinline__ restate get_state(int16_t idx) const
+  [[nodiscard]] __device__ __forceinline__ restate get_state(int16_t idx) const
   {
     return restate{ranges[idx * stride], inst_ids[idx * stride]};
   }
-  __device__ [[nodiscard]] __forceinline__ int16_t get_size() const { return size; }
+  [[nodiscard]] __device__ __forceinline__ int16_t get_size() const { return size; }
 
  private:
   int16_t size{};
@@ -101,7 +101,7 @@ struct alignas(8) relist {
     mask[pos >> 3] |= uc;
   }
 
-  __device__ [[nodiscard]] __forceinline__ bool readMask(int32_t pos) const
+  [[nodiscard]] __device__ __forceinline__ bool readMask(int32_t pos) const
   {
     u_char const uc = mask[pos >> 3];
     return static_cast<bool>((uc >> (pos & 7)) & 1);
diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp
index 3d11b641b3f..902e13fe75e 100644
--- a/cpp/src/strings/regex/regexec.cpp
+++ b/cpp/src/strings/regex/regexec.cpp
@@ -17,7 +17,9 @@
 #include "strings/regex/regcomp.h"
 #include "strings/regex/regex.cuh"
 
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -66,10 +68,11 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
                        cudf::util::round_up_safe(classes_size, sizeof(char32_t));
 
   // allocate memory to store all the prog data in a flat contiguous buffer
-  std::vector<u_char> h_buffer(memsize);                        // copy everything into here;
-  auto h_ptr    = h_buffer.data();                              // this is our running host ptr;
-  auto d_buffer = new rmm::device_buffer(memsize, stream);      // output device memory;
-  auto d_ptr    = reinterpret_cast<u_char*>(d_buffer->data());  // running device pointer
+  auto h_buffer =
+    cudf::detail::make_host_vector<u_char>(memsize, stream);  // copy everything into here;
+  auto h_ptr    = h_buffer.data();                            // this is our running host ptr;
+  auto d_buffer = new rmm::device_uvector<u_char>(memsize, stream);  // output device memory;
+  auto d_ptr    = d_buffer->data();                                  // running device pointer
 
   // create our device object; this is managed separately and returned to the caller
   auto* d_prog = new reprog_device(h_prog);
@@ -113,8 +116,7 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
   d_prog->_prog_size = memsize + sizeof(reprog_device);
 
   // copy flat prog to device memory
-  CUDF_CUDA_TRY(
-    cudaMemcpyAsync(d_buffer->data(), h_buffer.data(), memsize, cudaMemcpyDefault, stream.value()));
+  cudf::detail::cuda_memcpy_async<u_char>(*d_buffer, h_buffer, stream);
 
   // build deleter to cleanup device memory
   auto deleter = [d_buffer](reprog_device* t) {
diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
index 0f33fcb6fe1..4ed66622508 100644
--- a/cpp/src/strings/search/find.cu
+++ b/cpp/src/strings/search/find.cu
@@ -32,7 +32,9 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cooperative_groups.h>
 #include <cuda/atomic>
+#include <cuda/std/utility>
 #include <thrust/binary_search.h>
 #include <thrust/fill.h>
 #include <thrust/for_each.h>
@@ -141,7 +143,7 @@ CUDF_KERNEL void finder_warp_parallel_fn(column_device_view const d_strings,
     if (stop < 0) { return d_str.size_bytes(); }
     if (stop <= start) { return begin; }
     // we count from `begin` instead of recounting from the beginning of the string
-    return begin + std::get<0>(bytes_to_character_position(
+    return begin + cuda::std::get<0>(bytes_to_character_position(
                      string_view(d_str.data() + begin, d_str.size_bytes() - begin), stop - start));
   }();
 
@@ -347,13 +349,15 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
                                            string_view const d_target,
                                            bool* d_results)
 {
-  auto const idx    = cudf::detail::grid_1d::global_thread_id();
-  using warp_reduce = cub::WarpReduce<bool>;
-  __shared__ typename warp_reduce::TempStorage temp_storage;
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
 
   auto const str_idx = idx / cudf::detail::warp_size;
   if (str_idx >= d_strings.size()) { return; }
-  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  namespace cg        = cooperative_groups;
+  auto const warp     = cg::tiled_partition<cudf::detail::warp_size>(cg::this_thread_block());
+  auto const lane_idx = warp.thread_rank();
+
   if (d_strings.is_null(str_idx)) { return; }
   // get the string for this warp
   auto const d_str = d_strings.element<string_view>(str_idx);
@@ -373,7 +377,7 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
     }
   }
 
-  auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max());
+  auto const result = warp.any(found);
   if (lane_idx == 0) { d_results[str_idx] = result; }
 }
 
diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index 4c39fc96397..a74b19aae28 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -35,6 +35,7 @@
 
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
+#include <cuda/std/utility>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -140,14 +141,16 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings,
     auto first_byte = start_counts.second;
     if (start_counts.first < start) {
       auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte);
-      first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first));
+      first_byte +=
+        cuda::std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first));
     }
 
     stop           = min(stop, char_count);
     auto last_byte = stop_counts.second;
     if (stop_counts.first < stop) {
       auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte);
-      last_byte += std::get<0>(bytes_to_character_position(sub_str, stop - stop_counts.first));
+      last_byte +=
+        cuda::std::get<0>(bytes_to_character_position(sub_str, stop - stop_counts.first));
     }
 
     d_output[str_idx] = (first_byte < last_byte)
diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu
index b04e9961e01..b5063931485 100644
--- a/cpp/src/text/edit_distance.cu
+++ b/cpp/src/text/edit_distance.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,6 +30,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/functional>
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
@@ -64,10 +65,10 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
   if (str_length == 0) return tgt_length;
   if (tgt_length == 0) return str_length;
 
-  auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin();
-  auto itr   = str_length < tgt_length ? d_tgt.begin() : d_str.begin();
-  // .first is min and .second is max
-  auto const [n, m] = std::minmax(str_length, tgt_length);
+  auto begin   = str_length < tgt_length ? d_str.begin() : d_tgt.begin();
+  auto itr     = str_length < tgt_length ? d_tgt.begin() : d_str.begin();
+  auto const n = cuda::std::min(str_length, tgt_length);
+  auto const m = cuda::std::max(str_length, tgt_length);
   // setup compute buffer pointers
   auto v0 = buffer;
   auto v1 = v0 + n + 1;
@@ -81,7 +82,7 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str,
       auto sub_cost = v0[j] + (*itr != *itr_tgt);
       auto del_cost = v0[j + 1] + 1;
       auto ins_cost = v1[j] + 1;
-      v1[j + 1]     = std::min(std::min(sub_cost, del_cost), ins_cost);
+      v1[j + 1]     = cuda::std::min(cuda::std::min(sub_cost, del_cost), ins_cost);
     }
     thrust::swap(v0, v1);
   }
@@ -170,7 +171,7 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& str
                                      ? d_targets.element<cudf::string_view>(0)
                                      : d_targets.element<cudf::string_view>(idx);
                       // just need 2 integers for each character of the shorter string
-                      return (std::min(d_str.length(), d_tgt.length()) + 1) * 2;
+                      return (cuda::std::min(d_str.length(), d_tgt.length()) + 1) * 2;
                     });
 
   // get the total size of the temporary compute buffer
@@ -241,7 +242,7 @@ std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view con
       if (d_str1.empty() || d_str2.empty()) { return; }
       // the temp size needed is 2 integers per character of the shorter string
       d_offsets[idx - ((row + 1) * (row + 2)) / 2] =
-        (std::min(d_str1.length(), d_str2.length()) + 1) * 2;
+        (cuda::std::min(d_str1.length(), d_str2.length()) + 1) * 2;
     });
 
   // get the total size for the compute buffer
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index 2de94a4eb59..247440212d0 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,6 +36,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cub/cub.cuh>
+#include <cuda/std/functional>
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -243,7 +244,7 @@ CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_string
     }
   }
   auto const char_count = warp_reduce(temp_storage).Sum(count);
-  if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); }
+  if (lane_idx == 0) { d_counts[str_idx] = cuda::std::max(1, char_count - width + 1); }
 }
 
 /**
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index b7a719a2041..9ce17c36b1f 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,130 +40,17 @@
 
 #include <cooperative_groups.h>
 #include <cuda/atomic>
+#include <cuda/std/limits>
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 
-#include <limits>
-
 namespace nvtext {
 namespace detail {
 namespace {
 
-/**
- * @brief Compute the minhash of each string for each seed
- *
- * This is a warp-per-string algorithm where parallel threads within a warp
- * work on substrings of a single string row.
- *
- * @tparam HashFunction hash function to use on each substring
- *
- * @param d_strings Strings column to process
- * @param seeds Seeds for hashing each string
- * @param width Substring window size in characters
- * @param d_hashes Minhash output values for each string
- */
-template <
-  typename HashFunction,
-  typename hash_value_type = std::
-    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
-CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
-                                cudf::device_span<hash_value_type const> seeds,
-                                cudf::size_type width,
-                                hash_value_type* d_hashes)
-{
-  auto const idx = cudf::detail::grid_1d::global_thread_id();
-
-  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
-  if (str_idx >= d_strings.size()) { return; }
-  auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);
-
-  if (d_strings.is_null(str_idx)) { return; }
-
-  auto const d_str    = d_strings.element<cudf::string_view>(str_idx);
-  auto const d_output = d_hashes + (str_idx * seeds.size());
-
-  // initialize hashes output for this string
-  if (lane_idx == 0) {
-    auto const init = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
-    thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init);
-  }
-  __syncwarp();
-
-  auto const begin = d_str.data() + lane_idx;
-  auto const end   = d_str.data() + d_str.size_bytes();
-
-  // each lane hashes 'width' substrings of d_str
-  for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
-    if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; }
-    auto const check_str =  // used for counting 'width' characters
-      cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
-    auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width);
-    if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
-
-    auto const hash_str = cudf::string_view(itr, bytes);
-    // hashing with each seed on the same section of the string is 10x faster than
-    // computing the substrings for each seed
-    for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) {
-      auto const hasher = HashFunction(seeds[seed_idx]);
-      // hash substring and store the min value
-      if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
-        auto const hvalue = hasher(hash_str);
-        cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{*(d_output + seed_idx)};
-        ref.fetch_min(hvalue, cuda::std::memory_order_relaxed);
-      } else {
-        // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values
-        // but only uses the first uint64 value as requested by the LLM team.
-        auto const hvalue = thrust::get<0>(hasher(hash_str));
-        cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{*(d_output + seed_idx)};
-        ref.fetch_min(hvalue, cuda::std::memory_order_relaxed);
-      }
-    }
-  }
-}
-
-template <
-  typename HashFunction,
-  typename hash_value_type = std::
-    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
-std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
-                                         cudf::device_span<hash_value_type const> seeds,
-                                         cudf::size_type width,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::device_async_resource_ref mr)
-{
-  CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument);
-  CUDF_EXPECTS(width >= 2,
-               "Parameter width should be an integer value of 2 or greater",
-               std::invalid_argument);
-  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * seeds.size()) <
-                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
-               "The number of seeds times the number of input rows exceeds the column size limit",
-               std::overflow_error);
-
-  auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
-  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
-
-  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
-
-  auto hashes   = cudf::make_numeric_column(output_type,
-                                          input.size() * static_cast<cudf::size_type>(seeds.size()),
-                                          cudf::mask_state::UNALLOCATED,
-                                          stream,
-                                          mr);
-  auto d_hashes = hashes->mutable_view().data<hash_value_type>();
-
-  constexpr cudf::thread_index_type block_size = 256;
-  cudf::detail::grid_1d grid{
-    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
-  minhash_kernel<HashFunction><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    *d_strings, seeds, width, d_hashes);
-
-  return hashes;
-}
-
 constexpr cudf::thread_index_type block_size = 256;
 // for potentially tuning minhash_seed_kernel independently from block_size
 constexpr cudf::thread_index_type tile_size = block_size;
@@ -268,7 +155,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
     // initialize the output -- only needed for wider strings
     auto d_output = d_results + (str_idx * param_count);
     for (auto i = lane_idx; i < param_count; i += tile_size) {
-      d_output[i] = std::numeric_limits<hash_value_type>::max();
+      d_output[i] = cuda::std::numeric_limits<hash_value_type>::max();
     }
   }
 }
@@ -297,13 +184,13 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
  * @param d_results Final results vector of calculate values
  */
 template <typename hash_value_type, int blocks_per_string>
-CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
-                                         cudf::device_span<cudf::size_type const> indices,
-                                         cudf::device_span<hash_value_type const> parameter_a,
-                                         cudf::device_span<hash_value_type const> parameter_b,
-                                         cudf::size_type width,
-                                         hash_value_type const* d_hashes,
-                                         hash_value_type* d_results)
+CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
+                                cudf::device_span<cudf::size_type const> indices,
+                                cudf::device_span<hash_value_type const> parameter_a,
+                                cudf::device_span<hash_value_type const> parameter_b,
+                                cudf::size_type width,
+                                hash_value_type const* d_hashes,
+                                hash_value_type* d_results)
 {
   auto const tid = cudf::detail::grid_1d::global_thread_id();
   auto const idx = (tid / blocks_per_string) / block_size;
@@ -338,7 +225,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
       ? section_size
       : cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), section_size - width + 1);
 
-  auto const init     = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
+  auto const init     = size_bytes == 0 ? 0 : cuda::std::numeric_limits<hash_value_type>::max();
   auto const lane_idx = block.thread_rank();
   auto const d_output = d_results + (str_idx * parameter_a.size());
 
@@ -347,7 +234,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
 
   // constants used in the permutation calculations
   constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
-  constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
+  constexpr hash_value_type hash_max = cuda::std::numeric_limits<hash_value_type>::max();
 
   // found to be an efficient shared memory size for both hash types
   __shared__ hash_value_type block_values[block_size * params_per_thread];
@@ -478,7 +365,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
     auto d_indices = cudf::device_span<cudf::size_type const>(indices.data(), threshold_index);
     cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(d_indices.size()) * block_size,
                                block_size};
-    minhash_permuted_kernel<hash_value_type, 1>
+    minhash_kernel<hash_value_type, 1>
       <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
         *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results);
   }
@@ -489,7 +376,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
     auto d_indices =
       cudf::device_span<cudf::size_type const>(indices.data() + threshold_index, count);
     cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size};
-    minhash_permuted_kernel<hash_value_type, blocks_per_string>
+    minhash_kernel<hash_value_type, blocks_per_string>
       <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
         *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results);
   }
@@ -497,101 +384,6 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
   return results;
 }
 
-/**
- * @brief Compute the minhash of each list row of strings for each seed
- *
- * This is a warp-per-row algorithm where parallel threads within a warp
- * work on strings in a single list row.
- *
- * @tparam HashFunction hash function to use on each string
- *
- * @param d_input List of strings to process
- * @param seeds Seeds for hashing each string
- * @param d_hashes Minhash output values (one per row)
- */
-template <
-  typename HashFunction,
-  typename hash_value_type = std::
-    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
-CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input,
-                                     cudf::device_span<hash_value_type const> seeds,
-                                     hash_value_type* d_hashes)
-{
-  auto const idx     = cudf::detail::grid_1d::global_thread_id();
-  auto const row_idx = idx / cudf::detail::warp_size;
-
-  if (row_idx >= d_input.size()) { return; }
-  if (d_input.is_null(row_idx)) { return; }
-
-  auto const d_row    = cudf::list_device_view(d_input, row_idx);
-  auto const d_output = d_hashes + (row_idx * seeds.size());
-
-  // initialize hashes output for this row
-  auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);
-  if (lane_idx == 0) {
-    auto const init = d_row.size() == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
-    thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init);
-  }
-  __syncwarp();
-
-  // each lane hashes a string from the input row
-  for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) {
-    auto const hash_str =
-      d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element<cudf::string_view>(str_idx);
-    for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) {
-      auto const hasher = HashFunction(seeds[seed_idx]);
-      // hash string and store the min value
-      hash_value_type hv;
-      if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
-        hv = hasher(hash_str);
-      } else {
-        // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values
-        // but only uses the first uint64 value as requested by the LLM team.
-        hv = thrust::get<0>(hasher(hash_str));
-      }
-      cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{*(d_output + seed_idx)};
-      ref.fetch_min(hv, cuda::std::memory_order_relaxed);
-    }
-  }
-}
-
-template <
-  typename HashFunction,
-  typename hash_value_type = std::
-    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
-std::unique_ptr<cudf::column> word_minhash_fn(cudf::lists_column_view const& input,
-                                              cudf::device_span<hash_value_type const> seeds,
-                                              rmm::cuda_stream_view stream,
-                                              rmm::device_async_resource_ref mr)
-{
-  CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument);
-  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * seeds.size()) <
-                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
-               "The number of seeds times the number of input rows exceeds the column size limit",
-               std::overflow_error);
-
-  auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
-  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
-
-  auto const d_input = cudf::column_device_view::create(input.parent(), stream);
-
-  auto hashes   = cudf::make_numeric_column(output_type,
-                                          input.size() * static_cast<cudf::size_type>(seeds.size()),
-                                          cudf::mask_state::UNALLOCATED,
-                                          stream,
-                                          mr);
-  auto d_hashes = hashes->mutable_view().data<hash_value_type>();
-  auto lcdv     = cudf::detail::lists_column_device_view(*d_input);
-
-  constexpr cudf::thread_index_type block_size = 256;
-  cudf::detail::grid_1d grid{
-    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
-  minhash_word_kernel<HashFunction>
-    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(lcdv, seeds, d_hashes);
-
-  return hashes;
-}
-
 std::unique_ptr<cudf::column> build_list_result(cudf::column_view const& input,
                                                 std::unique_ptr<cudf::column>&& hashes,
                                                 cudf::size_type seeds_size,
@@ -620,30 +412,6 @@ std::unique_ptr<cudf::column> build_list_result(cudf::column_view const& input,
 }
 }  // namespace
 
-std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
-                                      cudf::numeric_scalar<uint32_t> const& seed,
-                                      cudf::size_type width,
-                                      rmm::cuda_stream_view stream,
-                                      rmm::device_async_resource_ref mr)
-{
-  using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
-  auto const seeds   = cudf::device_span<uint32_t const>{seed.data(), 1};
-  auto hashes        = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
-  hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count());
-  return hashes;
-}
-
-std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
-                                      cudf::device_span<uint32_t const> seeds,
-                                      cudf::size_type width,
-                                      rmm::cuda_stream_view stream,
-                                      rmm::device_async_resource_ref mr)
-{
-  using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
-  auto hashes        = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
-  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
-}
-
 std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
                                       uint32_t seed,
                                       cudf::device_span<uint32_t const> parameter_a,
@@ -658,30 +426,6 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
   return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr);
 }
 
-std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
-                                        cudf::numeric_scalar<uint64_t> const& seed,
-                                        cudf::size_type width,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::device_async_resource_ref mr)
-{
-  using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
-  auto const seeds   = cudf::device_span<uint64_t const>{seed.data(), 1};
-  auto hashes        = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
-  hashes->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr), input.null_count());
-  return hashes;
-}
-
-std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
-                                        cudf::device_span<uint64_t const> seeds,
-                                        cudf::size_type width,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::device_async_resource_ref mr)
-{
-  using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
-  auto hashes        = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
-  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
-}
-
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         uint64_t seed,
                                         cudf::device_span<uint64_t const> parameter_a,
@@ -696,45 +440,18 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr);
 }
 
-std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
-                                           cudf::device_span<uint32_t const> seeds,
-                                           rmm::cuda_stream_view stream,
-                                           rmm::device_async_resource_ref mr)
-{
-  using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
-  auto hashes        = detail::word_minhash_fn<HashFunction>(input, seeds, stream, mr);
-  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
-}
-
-std::unique_ptr<cudf::column> word_minhash64(cudf::lists_column_view const& input,
-                                             cudf::device_span<uint64_t const> seeds,
-                                             rmm::cuda_stream_view stream,
-                                             rmm::device_async_resource_ref mr)
-{
-  using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
-  auto hashes        = detail::word_minhash_fn<HashFunction>(input, seeds, stream, mr);
-  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
-}
 }  // namespace detail
 
 std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
-                                      cudf::numeric_scalar<uint32_t> seed,
-                                      cudf::size_type width,
-                                      rmm::cuda_stream_view stream,
-                                      rmm::device_async_resource_ref mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::minhash(input, seed, width, stream, mr);
-}
-
-std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
-                                      cudf::device_span<uint32_t const> seeds,
+                                      uint32_t seed,
+                                      cudf::device_span<uint32_t const> parameter_a,
+                                      cudf::device_span<uint32_t const> parameter_b,
                                       cudf::size_type width,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::minhash(input, seeds, width, stream, mr);
+  return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr);
 }
 
 std::unique_ptr<cudf::column> minhash_permuted(cudf::strings_column_view const& input,
@@ -750,23 +467,15 @@ std::unique_ptr<cudf::column> minhash_permuted(cudf::strings_column_view const&
 }
 
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
-                                        cudf::numeric_scalar<uint64_t> seed,
-                                        cudf::size_type width,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::device_async_resource_ref mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::minhash64(input, seed, width, stream, mr);
-}
-
-std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
-                                        cudf::device_span<uint64_t const> seeds,
+                                        uint64_t seed,
+                                        cudf::device_span<uint64_t const> parameter_a,
+                                        cudf::device_span<uint64_t const> parameter_b,
                                         cudf::size_type width,
                                         rmm::cuda_stream_view stream,
                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::minhash64(input, seeds, width, stream, mr);
+  return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr);
 }
 
 std::unique_ptr<cudf::column> minhash64_permuted(cudf::strings_column_view const& input,
@@ -781,21 +490,4 @@ std::unique_ptr<cudf::column> minhash64_permuted(cudf::strings_column_view const
   return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr);
 }
 
-std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
-                                           cudf::device_span<uint32_t const> seeds,
-                                           rmm::cuda_stream_view stream,
-                                           rmm::device_async_resource_ref mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::word_minhash(input, seeds, stream, mr);
-}
-
-std::unique_ptr<cudf::column> word_minhash64(cudf::lists_column_view const& input,
-                                             cudf::device_span<uint64_t const> seeds,
-                                             rmm::cuda_stream_view stream,
-                                             rmm::device_async_resource_ref mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::word_minhash64(input, seeds, stream, mr);
-}
 }  // namespace nvtext
diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu
index 943bcbe9b3a..b041ce3ce0a 100644
--- a/cpp/src/text/replace.cu
+++ b/cpp/src/text/replace.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,6 +35,7 @@
 #include <rmm/cuda_stream_view.hpp>
 
 #include <cuda/atomic>
+#include <cuda/std/functional>
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
@@ -196,7 +197,7 @@ struct sub_offset_fn {
   {
     // keep delimiter search within this sub-block
     auto const end =
-      d_input_chars + std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset);
+      d_input_chars + cuda::std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset);
     // starting point of this sub-block
     auto itr = d_input_chars + first_offset + ((idx + 1) * LS_SUB_BLOCK_SIZE);
     while ((itr < end) &&
diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu
index a3bed45e4bd..7a39199011e 100644
--- a/cpp/src/text/subword/data_normalizer.cu
+++ b/cpp/src/text/subword/data_normalizer.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/functional>
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/pair.h>
@@ -134,8 +135,8 @@ extract_code_points_from_utf8(unsigned char const* strings,
   constexpr uint8_t max_utf8_blocks_for_char    = 4;
   uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0};
 
-  for (int i = 0; i < std::min(static_cast<size_t>(max_utf8_blocks_for_char),
-                               total_bytes - start_byte_for_thread);
+  for (int i = 0; i < cuda::std::min(static_cast<size_t>(max_utf8_blocks_for_char),
+                                     total_bytes - start_byte_for_thread);
        ++i) {
     utf8_blocks[i] = strings[start_byte_for_thread + i];
   }
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index b13ad0a7de8..ee51a426eac 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -19,6 +19,8 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -198,8 +200,8 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::getline(hash_file, line);
   result.num_bins = str_to_uint32(line, line_no++);
 
-  std::vector<uint64_t> bin_coefficients(result.num_bins);
-  std::vector<uint16_t> bin_offsets(result.num_bins);
+  auto bin_coefficients = cudf::detail::make_host_vector<uint64_t>(result.num_bins, stream);
+  auto bin_offsets      = cudf::detail::make_host_vector<uint16_t>(result.num_bins, stream);
 
   for (int i = 0; i < result.num_bins; ++i) {
     std::getline(hash_file, line);
@@ -216,7 +218,7 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
 
   std::getline(hash_file, line);
   uint64_t hash_table_length = str_to_uint64(line, line_no++);
-  std::vector<uint64_t> table(hash_table_length);
+  auto table                 = cudf::detail::make_host_vector<uint64_t>(hash_table_length, stream);
 
   std::generate(table.begin(), table.end(), [&hash_file, &line_no]() {
     std::string line;
@@ -239,33 +241,32 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
                                            cudf::mask_state::UNALLOCATED,
                                            stream,
                                            mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.table->mutable_view().data<uint64_t>(),
-                                table.data(),
-                                table.size() * sizeof(uint64_t),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  cudf::detail::cuda_memcpy_async<uint64_t>(
+    cudf::device_span<uint64_t>(result.table->mutable_view().data<uint64_t>(), table.size()),
+    table,
+    stream);
 
   result.bin_coefficients = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT64},
                                                       bin_coefficients.size(),
                                                       cudf::mask_state::UNALLOCATED,
                                                       stream,
                                                       mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_coefficients->mutable_view().data<uint64_t>(),
-                                bin_coefficients.data(),
-                                bin_coefficients.size() * sizeof(uint64_t),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  cudf::detail::cuda_memcpy_async<uint64_t>(
+    cudf::device_span<uint64_t>(result.bin_coefficients->mutable_view().data<uint64_t>(),
+                                bin_coefficients.size()),
+    bin_coefficients,
+    stream);
 
   result.bin_offsets = cudf::make_numeric_column(cudf::data_type{cudf::type_id::UINT16},
                                                  bin_offsets.size(),
                                                  cudf::mask_state::UNALLOCATED,
                                                  stream,
                                                  mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(result.bin_offsets->mutable_view().data<uint16_t>(),
-                                bin_offsets.data(),
-                                bin_offsets.size() * sizeof(uint16_t),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  cudf::detail::cuda_memcpy_async<uint16_t>(
+    cudf::device_span<uint16_t>(result.bin_offsets->mutable_view().data<uint16_t>(),
+                                bin_offsets.size()),
+    bin_offsets,
+    stream);
 
   auto cp_metadata            = detail::get_codepoint_metadata(stream);
   auto const cp_metadata_size = static_cast<cudf::size_type>(cp_metadata.size());
diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu
index dd1e8ddb027..19f144dd158 100644
--- a/cpp/src/text/subword/wordpiece_tokenizer.cu
+++ b/cpp/src/text/subword/wordpiece_tokenizer.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/std/functional>
+#include <cuda/std/limits>
 #include <thrust/copy.h>
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
@@ -87,7 +89,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi
 
   // Deal with the start_word_indices array
   if (char_for_thread < num_code_points) {
-    uint32_t val_to_write = std::numeric_limits<uint32_t>::max();
+    uint32_t val_to_write = cuda::std::numeric_limits<uint32_t>::max();
     if ((code_points[char_for_thread] != SPACE_CODE_POINT) && (char_for_thread > 0) &&
         (code_points[char_for_thread - 1] == SPACE_CODE_POINT)) {
       val_to_write = char_for_thread;
@@ -95,7 +97,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi
     start_word_indices[char_for_thread] = val_to_write;
 
     // Deal with the end_word_indices_array
-    val_to_write = std::numeric_limits<uint32_t>::max();
+    val_to_write = cuda::std::numeric_limits<uint32_t>::max();
     if ((code_points[char_for_thread] != SPACE_CODE_POINT) &&
         (char_for_thread + 1 < num_code_points) &&
         (code_points[char_for_thread + 1] == SPACE_CODE_POINT)) {
@@ -103,7 +105,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi
     }
     end_word_indices[char_for_thread] = val_to_write;
 
-    token_ids[char_for_thread]       = std::numeric_limits<uint32_t>::max();
+    token_ids[char_for_thread]       = cuda::std::numeric_limits<uint32_t>::max();
     tokens_per_word[char_for_thread] = 0;
   }
 }
@@ -214,7 +216,7 @@ struct mark_special_tokens {
   __device__ void operator()(size_t idx) const
   {
     uint32_t const start_index = start_word_indices[idx];
-    if ((start_index == std::numeric_limits<uint32_t>::max()) ||
+    if ((start_index == cuda::std::numeric_limits<uint32_t>::max()) ||
         ((start_index + MIN_ST_WIDTH + 2) > num_code_points))
       return;
     if (code_points[start_index] != '[') return;
@@ -225,12 +227,12 @@ struct mark_special_tokens {
     uint32_t const end_index = [&] {
       auto const begin = start_word_indices + start_pos;
       auto const width =
-        std::min(static_cast<size_t>(MAX_ST_WIDTH + 1), (num_code_points - start_pos));
+        cuda::std::min(static_cast<size_t>(MAX_ST_WIDTH + 1), (num_code_points - start_pos));
       auto const end = begin + width;
       // checking the next start-word is more reliable than arbitrarily searching for ']'
       // in case the text is split across string rows
       auto const iter = thrust::find_if(thrust::seq, begin + 1, end, [](auto swi) {
-        return swi != std::numeric_limits<uint32_t>::max();
+        return swi != cuda::std::numeric_limits<uint32_t>::max();
       });
       return iter == end ? start_index : static_cast<uint32_t>(iter - start_word_indices);
     }();
@@ -254,11 +256,11 @@ struct mark_special_tokens {
     thrust::fill(thrust::seq,
                  start_word_indices + start_index + 1,  // keep the first one
                  start_word_indices + end_index + 1,
-                 std::numeric_limits<uint32_t>::max());
+                 cuda::std::numeric_limits<uint32_t>::max());
     thrust::fill(thrust::seq,
                  end_word_indices + start_index,
                  end_word_indices + end_index + 1,
-                 std::numeric_limits<uint32_t>::max());
+                 cuda::std::numeric_limits<uint32_t>::max());
 
     // reset the new end-word index
     end_word_indices[end_pos] = end_pos + 1;
@@ -382,7 +384,7 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points,
       // We need to clean up the global array. This case is very uncommon.
       //  Only 0.016% of words cannot be resolved to a token from the squad dev set.
       for (uint32_t i = 1; i < num_values_tokenized; ++i) {
-        token_ids[token_start + i] = std::numeric_limits<uint32_t>::max();
+        token_ids[token_start + i] = cuda::std::numeric_limits<uint32_t>::max();
       }
       num_values_tokenized = 0;
     }
@@ -423,7 +425,10 @@ uvector_pair wordpiece_tokenizer::tokenize(cudf::strings_column_view const& inpu
 }
 
 struct copy_if_fn {  // inline lambda not allowed in private or protected member function
-  __device__ bool operator()(uint32_t cp) { return cp != std::numeric_limits<uint32_t>::max(); }
+  __device__ bool operator()(uint32_t cp)
+  {
+    return cp != cuda::std::numeric_limits<uint32_t>::max();
+  }
 };
 
 struct tranform_fn {  // just converting uint8 value to uint32
@@ -487,7 +492,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
   auto itr_end = thrust::remove(rmm::exec_policy(stream),
                                 device_word_indices.begin(),
                                 device_word_indices.end(),
-                                std::numeric_limits<uint32_t>::max());
+                                cuda::std::numeric_limits<uint32_t>::max());
 
   // The number of tokens selected will be double the number of words since we
   // select from both the start and end index arrays.
@@ -523,7 +528,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre
   // token so this will always have enough memory to store the contiguous tokens.
   uint32_t* contiguous_token_ids = device_code_points;
   auto const copy_size           =  // thrust::copy_if limited to copying int-max values
-    std::min(device_token_ids.size(), static_cast<std::size_t>(std::numeric_limits<int>::max()));
+    cuda::std::min(device_token_ids.size(),
+                   static_cast<std::size_t>(cuda::std::numeric_limits<int>::max()));
   auto ids_itr       = device_token_ids.begin();
   auto const ids_end = device_token_ids.end();
   while (ids_itr != ids_end) {
diff --git a/cpp/src/transform/jit/kernel.cu b/cpp/src/transform/jit/kernel.cu
index 4fd0369c26b..9d96c11c3f2 100644
--- a/cpp/src/transform/jit/kernel.cu
+++ b/cpp/src/transform/jit/kernel.cu
@@ -38,8 +38,9 @@ CUDF_KERNEL void kernel(cudf::size_type size, TypeOut* out_data, TypeIn* in_data
 {
   // cannot use global_thread_id utility due to a JIT build issue by including
   // the `cudf/detail/utilities/cuda.cuh` header
-  thread_index_type const start  = threadIdx.x + blockIdx.x * blockDim.x;
-  thread_index_type const stride = blockDim.x * gridDim.x;
+  auto const block_size          = static_cast<thread_index_type>(blockDim.x);
+  thread_index_type const start  = threadIdx.x + blockIdx.x * block_size;
+  thread_index_type const stride = block_size * gridDim.x;
 
   for (auto i = start; i < static_cast<thread_index_type>(size); i += stride) {
     GENERIC_UNARY_OP(&out_data[i], in_data[i]);
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 66bbe532e46..39c11295fbd 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -413,7 +413,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span<column_device_view const> col
                                        size_type max_branch_depth)
 {
   extern __shared__ row_span thread_branch_stacks[];
-  int const tid = threadIdx.x + blockIdx.x * blockDim.x;
+  auto const tid = static_cast<size_type>(cudf::detail::grid_1d::global_thread_id());
 
   auto const num_segments = static_cast<size_type>(output.size());
   if (tid >= num_segments) { return; }
diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp
index e30806a5011..94d27d976c3 100644
--- a/cpp/src/utilities/host_memory.cpp
+++ b/cpp/src/utilities/host_memory.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,8 +14,10 @@
  * limitations under the License.
  */
 
-#include <cudf/detail/utilities/logger.hpp>
+#include "io/utilities/getenv_or.hpp"
+
 #include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
@@ -46,7 +48,7 @@ class fixed_pinned_pool_memory_resource {
       pool_size_{rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT)},
       pool_{new host_pooled_mr(upstream_mr_, pool_size_, pool_size_)}
   {
-    CUDF_LOG_INFO("Pinned pool size = {}", pool_size_);
+    CUDF_LOG_INFO("Pinned pool size = %zu", pool_size_);
 
     // Allocate full size from the pinned pool to figure out the beginning and end address
     pool_begin_ = pool_->allocate_async(pool_size_, stream_);
@@ -277,7 +279,7 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts)
 CUDF_EXPORT auto& kernel_pinned_copy_threshold()
 {
   // use cudaMemcpyAsync for all pinned copies
-  static std::atomic<size_t> threshold = 0;
+  static std::atomic<size_t> threshold = getenv_or("LIBCUDF_KERNEL_PINNED_COPY_THRESHOLD", 0);
   return threshold;
 }
 
@@ -291,7 +293,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(
 CUDF_EXPORT auto& allocate_host_as_pinned_threshold()
 {
   // use pageable memory for all host allocations
-  static std::atomic<size_t> threshold = 0;
+  static std::atomic<size_t> threshold = getenv_or("LIBCUDF_ALLOCATE_HOST_AS_PINNED_THRESHOLD", 0);
   return threshold;
 }
 
diff --git a/cpp/src/utilities/logger.cpp b/cpp/src/utilities/logger.cpp
deleted file mode 100644
index e52fffbd8c6..00000000000
--- a/cpp/src/utilities/logger.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/utilities/error.hpp>
-#include <cudf/utilities/logger.hpp>
-
-#include <spdlog/sinks/basic_file_sink.h>
-#include <spdlog/sinks/stdout_sinks.h>
-
-#include <string>
-
-namespace {
-
-/**
- * @brief Creates a sink for libcudf logging.
- *
- * Returns a file sink if the file name has been specified, otherwise returns a stderr sink.
- */
-[[nodiscard]] spdlog::sink_ptr make_libcudf_sink()
-{
-  if (auto filename = std::getenv("LIBCUDF_DEBUG_LOG_FILE"); filename != nullptr) {
-    return std::make_shared<spdlog::sinks::basic_file_sink_mt>(filename, true);
-  } else {
-    return std::make_shared<spdlog::sinks::stderr_sink_mt>();
-  }
-}
-
-/**
- * @brief Converts the level name into the `spdlog` level enum.
- */
-[[nodiscard]] spdlog::level::level_enum libcudf_log_level()
-{
-  auto const env_level = std::getenv("LIBCUDF_LOGGING_LEVEL");
-  if (env_level == nullptr) { return spdlog::level::warn; }
-
-  auto const env_lvl_str = std::string(env_level);
-  if (env_lvl_str == "TRACE") return spdlog::level::trace;
-  if (env_lvl_str == "DEBUG") return spdlog::level::debug;
-  if (env_lvl_str == "INFO") return spdlog::level::info;
-  if (env_lvl_str == "WARN") return spdlog::level::warn;
-  if (env_lvl_str == "ERROR") return spdlog::level::err;
-  if (env_lvl_str == "CRITICAL") return spdlog::level::critical;
-  if (env_lvl_str == "OFF") return spdlog::level::off;
-
-  CUDF_FAIL("Invalid value for LIBCUDF_LOGGING_LEVEL environment variable");
-}
-
-/**
- * @brief Simple wrapper around a spdlog::logger that performs cuDF-specific initialization.
- */
-struct logger_wrapper {
-  spdlog::logger logger_;
-
-  logger_wrapper() : logger_{"CUDF", make_libcudf_sink()}
-  {
-    logger_.set_pattern("[%6t][%H:%M:%S:%f][%-6l] %v");
-    logger_.set_level(libcudf_log_level());
-    logger_.flush_on(spdlog::level::warn);
-  }
-};
-
-}  // namespace
-
-spdlog::logger& cudf::detail::logger()
-{
-  static logger_wrapper wrapped{};
-  return wrapped.logger_;
-}
-
-spdlog::logger& cudf::logger() { return cudf::detail::logger(); }
diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp
index 9d1bebd1937..80364885980 100644
--- a/cpp/src/utilities/stream_pool.cpp
+++ b/cpp/src/utilities/stream_pool.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/logger.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -129,7 +129,8 @@ class rmm_cuda_stream_pool : public cuda_stream_pool {
   std::vector<rmm::cuda_stream_view> get_streams(std::size_t count) override
   {
     if (count > STREAM_POOL_SIZE) {
-      CUDF_LOG_WARN("get_streams called with count ({}) > pool size ({})", count, STREAM_POOL_SIZE);
+      CUDF_LOG_WARN(
+        "get_streams called with count (%zu) > pool size (%zu)", count, STREAM_POOL_SIZE);
     }
     auto streams = std::vector<rmm::cuda_stream_view>();
     for (uint32_t i = 0; i < count; i++) {
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index adf512811cc..344979e1288 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -132,6 +132,8 @@ ConfigureTest(
   groupby/groupby_test_util.cpp
   groupby/groups_tests.cpp
   groupby/histogram_tests.cpp
+  groupby/host_udf_example_tests.cu
+  groupby/host_udf_tests.cpp
   groupby/keys_tests.cpp
   groupby/lists_tests.cpp
   groupby/m2_tests.cpp
@@ -190,6 +192,7 @@ ConfigureTest(
   hashing/sha256_test.cpp
   hashing/sha384_test.cpp
   hashing/sha512_test.cpp
+  hashing/xxhash_32_test.cpp
   hashing/xxhash_64_test.cpp
 )
 
diff --git a/cpp/tests/bitmask/set_nullmask_tests.cu b/cpp/tests/bitmask/set_nullmask_tests.cu
index e95c9fb41c6..9f8d22ea94d 100644
--- a/cpp/tests/bitmask/set_nullmask_tests.cu
+++ b/cpp/tests/bitmask/set_nullmask_tests.cu
@@ -31,6 +31,7 @@
 #include <algorithm>
 #include <iostream>
 
+namespace {
 struct valid_bit_functor {
   cudf::bitmask_type const* _null_mask;
   __device__ bool operator()(cudf::size_type element_index) const noexcept
@@ -38,13 +39,7 @@ struct valid_bit_functor {
     return cudf::bit_is_set(_null_mask, element_index);
   }
 };
-
-std::ostream& operator<<(std::ostream& stream, thrust::host_vector<bool> const& bits)
-{
-  for (auto _bit : bits)
-    stream << int(_bit);
-  return stream;
-}
+}  // namespace
 
 struct SetBitmaskTest : public cudf::test::BaseFixture {
   void expect_bitmask_equal(cudf::bitmask_type const* bitmask,  // Device Ptr
diff --git a/cpp/tests/bitmask/valid_if_tests.cu b/cpp/tests/bitmask/valid_if_tests.cu
index 96f122f21a8..8ffcc552ecb 100644
--- a/cpp/tests/bitmask/valid_if_tests.cu
+++ b/cpp/tests/bitmask/valid_if_tests.cu
@@ -28,6 +28,7 @@
 
 struct ValidIfTest : public cudf::test::BaseFixture {};
 
+namespace {
 struct odds_valid {
   __host__ __device__ bool operator()(cudf::size_type i) { return i % 2; }
 };
@@ -37,6 +38,7 @@ struct all_valid {
 struct all_null {
   __host__ __device__ bool operator()(cudf::size_type i) { return false; }
 };
+}  // namespace
 
 TEST_F(ValidIfTest, EmptyRange)
 {
diff --git a/cpp/tests/column/bit_cast_test.cpp b/cpp/tests/column/bit_cast_test.cpp
index 5570a7d498c..1f29ea9e5fc 100644
--- a/cpp/tests/column/bit_cast_test.cpp
+++ b/cpp/tests/column/bit_cast_test.cpp
@@ -25,6 +25,7 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
+namespace {
 template <typename T, typename T2 = void>
 struct rep_type_impl {
   using type = void;
@@ -47,12 +48,14 @@ struct rep_type_impl<T, std::enable_if_t<cudf::is_fixed_point<T>()>> {
 
 template <typename T>
 using rep_type_t = typename rep_type_impl<T>::type;
+}  // namespace
 
 template <typename T>
 struct ColumnViewAllTypesTests : public cudf::test::BaseFixture {};
 
 TYPED_TEST_SUITE(ColumnViewAllTypesTests, cudf::test::FixedWidthTypes);
 
+namespace {
 template <typename FromType, typename ToType, typename Iterator>
 void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator end)
 {
@@ -102,6 +105,7 @@ void do_bit_cast(cudf::column_view const& column_view, Iterator begin, Iterator
     }
   }
 }
+}  // namespace
 
 TYPED_TEST(ColumnViewAllTypesTests, BitCast)
 {
diff --git a/cpp/tests/column/compound_test.cu b/cpp/tests/column/compound_test.cu
index d7e93fb22a3..fff3282fdd5 100644
--- a/cpp/tests/column/compound_test.cu
+++ b/cpp/tests/column/compound_test.cu
@@ -34,6 +34,7 @@
 
 struct CompoundColumnTest : public cudf::test::BaseFixture {};
 
+namespace {
 template <typename ColumnDeviceView>
 struct checker_for_level1 {
   ColumnDeviceView d_column;
@@ -62,6 +63,7 @@ struct checker_for_level2 {
     return bcheck;
   }
 };
+}  // namespace
 
 TEST_F(CompoundColumnTest, ChildrenLevel1)
 {
diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu
index b81f8196d89..2fb24f6b31e 100644
--- a/cpp/tests/device_atomics/device_atomics_test.cu
+++ b/cpp/tests/device_atomics/device_atomics_test.cu
@@ -31,6 +31,7 @@
 
 #include <algorithm>
 
+namespace {
 template <typename T>
 CUDF_KERNEL void gpu_atomic_test(T* result, T* data, size_t size)
 {
@@ -109,6 +110,7 @@ std::enable_if_t<cudf::is_timestamp<T>(), T> accumulate(cudf::host_span<T const>
     xs.begin(), xs.end(), ys.begin(), [](T const& ts) { return ts.time_since_epoch().count(); });
   return T{typename T::duration{std::accumulate(ys.begin(), ys.end(), 0)}};
 }
+}  // namespace
 
 template <typename T>
 struct AtomicsTest : public cudf::test::BaseFixture {
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index b96c6909e55..f8f8d525043 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -577,10 +577,12 @@ TEST_F(FixedPointTest, Decimal32FloatVector)
   float_vector_test(0.15, 20, -2, std::multiplies<>());
 }
 
+namespace {
 struct cast_to_int32_fn {
   using decimal32 = fixed_point<int32_t, Radix::BASE_10>;
   int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast<int32_t>(fp); }
 };
+}  // namespace
 
 TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper)
 {
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu
index f34760341d8..ddc48c97012 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cu
+++ b/cpp/tests/fixed_point/fixed_point_tests.cu
@@ -72,10 +72,12 @@ TYPED_TEST(FixedPointTestAllReps, DecimalXXThrust)
   EXPECT_EQ(vec2, vec3);
 }
 
+namespace {
 struct cast_to_int32_fn {
   using decimal32 = fixed_point<int32_t, Radix::BASE_10>;
   int32_t __host__ __device__ operator()(decimal32 fp) { return static_cast<int32_t>(fp); }
 };
+}  // namespace
 
 TEST_F(FixedPointTest, DecimalXXThrustOnDevice)
 {
diff --git a/cpp/tests/groupby/host_udf_example_tests.cu b/cpp/tests/groupby/host_udf_example_tests.cu
new file mode 100644
index 00000000000..a454bd692fc
--- /dev/null
+++ b/cpp/tests/groupby/host_udf_example_tests.cu
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/aggregation/host_udf.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/std/limits>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/transform.h>
+
+namespace {
+/**
+ * @brief Example host-based UDF (derived from `cudf::host_udf_base`) for groupby aggregation.
+ *
+ * For each group, computes `(group_idx + 1) * group_sum_of_squares - group_max * group_sum`;
+ * groups that are empty or contain only nulls produce a null output row.
+ */
+struct host_udf_groupby_example : cudf::host_udf_base {
+  host_udf_groupby_example() = default;
+
+  [[nodiscard]] data_attribute_set_t get_required_data() const override
+  {
+    // We need grouped values, group offsets, group labels, and also the results of the groups'
+    // MAX and SUM aggregations.
+    return {groupby_data_attribute::GROUPED_VALUES,
+            groupby_data_attribute::GROUP_OFFSETS,
+            groupby_data_attribute::GROUP_LABELS,
+            cudf::make_max_aggregation<cudf::groupby_aggregation>(),
+            cudf::make_sum_aggregation<cudf::groupby_aggregation>()};
+  }
+
+  [[nodiscard]] output_t get_empty_output(
+    [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
+    [[maybe_unused]] rmm::cuda_stream_view stream,
+    [[maybe_unused]] rmm::device_async_resource_ref mr) const override
+  {
+    return cudf::make_empty_column(
+      cudf::data_type{cudf::type_to_id<typename groupby_fn::OutputType>()});
+  }
+
+  [[nodiscard]] output_t operator()(input_map_t const& input,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr) const override
+  {
+    auto const& values =
+      std::get<cudf::column_view>(input.at(groupby_data_attribute::GROUPED_VALUES));
+    return cudf::type_dispatcher(values.type(), groupby_fn{this}, input, stream, mr);
+  }
+
+  [[nodiscard]] std::size_t do_hash() const override
+  {
+    // Every instance of this class returns the same constant hash.
+    return std::size_t{12345};
+  }
+
+  [[nodiscard]] bool is_equal(host_udf_base const& other) const override
+  {
+    // Equal whenever the other object is also an instance of this class.
+    return dynamic_cast<host_udf_groupby_example const*>(&other) != nullptr;
+  }
+
+  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+  {
+    return std::make_unique<host_udf_groupby_example>();
+  }
+
+  struct groupby_fn {
+    // Pointer to the enclosing UDF object, used to call `get_empty_output`.
+    host_udf_groupby_example const* parent;
+
+    // For simplicity, this example only accepts double input and always produces double output.
+    using InputType  = double;
+    using OutputType = double;
+
+    template <typename T, typename... Args, CUDF_ENABLE_IF(!std::is_same_v<InputType, T>)>
+    output_t operator()(Args...) const
+    {
+      CUDF_FAIL("Unsupported input type.");
+    }
+
+    template <typename T, CUDF_ENABLE_IF(std::is_same_v<InputType, T>)>
+    output_t operator()(input_map_t const& input,
+                        rmm::cuda_stream_view stream,
+                        rmm::device_async_resource_ref mr) const
+    {
+      auto const& values =
+        std::get<cudf::column_view>(input.at(groupby_data_attribute::GROUPED_VALUES));
+      if (values.size() == 0) { return parent->get_empty_output(std::nullopt, stream, mr); }
+
+      auto const offsets = std::get<cudf::device_span<cudf::size_type const>>(
+        input.at(groupby_data_attribute::GROUP_OFFSETS));
+      CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets.");
+      auto const num_groups    = static_cast<int>(offsets.size()) - 1;
+      auto const group_indices = std::get<cudf::device_span<cudf::size_type const>>(
+        input.at(groupby_data_attribute::GROUP_LABELS));
+      auto const group_max = std::get<cudf::column_view>(
+        input.at(cudf::make_max_aggregation<cudf::groupby_aggregation>()));
+      auto const group_sum = std::get<cudf::column_view>(
+        input.at(cudf::make_sum_aggregation<cudf::groupby_aggregation>()));
+
+      auto const values_dv_ptr = cudf::column_device_view::create(values, stream);
+      auto const output = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<OutputType>()},
+                                                    num_groups,
+                                                    cudf::mask_state::UNALLOCATED,
+                                                    stream,
+                                                    mr);
+
+      // Per group: its own index if the output row is valid, otherwise a negative marker value.
+      rmm::device_uvector<cudf::size_type> valid_idx(num_groups, stream);
+
+      thrust::transform(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_groups),
+        thrust::make_zip_iterator(output->mutable_view().begin<OutputType>(), valid_idx.begin()),
+        transform_fn{*values_dv_ptr,
+                     offsets,
+                     group_indices,
+                     group_max.begin<InputType>(),
+                     group_sum.begin<InputType>()});
+
+      auto const valid_idx_cv = cudf::column_view{
+        cudf::data_type{cudf::type_id::INT32}, num_groups, valid_idx.begin(), nullptr, 0};
+      return std::move(cudf::gather(cudf::table_view{{output->view()}},
+                                    valid_idx_cv,
+                                    cudf::out_of_bounds_policy::NULLIFY,
+                                    stream,
+                                    mr)
+                         ->release()
+                         .front());
+    }
+
+    struct transform_fn {
+      cudf::column_device_view values;
+      cudf::device_span<cudf::size_type const> offsets;
+      cudf::device_span<cudf::size_type const> group_indices;
+      InputType const* group_max;
+      InputType const* group_sum;
+
+      thrust::tuple<OutputType, cudf::size_type> __device__ operator()(cudf::size_type idx) const
+      {
+        auto const start = offsets[idx];
+        auto const end   = offsets[idx + 1];
+
+        auto constexpr invalid_idx = cuda::std::numeric_limits<cudf::size_type>::lowest();
+        if (start == end) { return {OutputType{0}, invalid_idx}; }  // empty group -> null row
+
+        auto sum_sqr = OutputType{0};
+        bool has_valid{false};
+        for (auto i = start; i < end; ++i) {
+          if (values.is_null(i)) { continue; }  // nulls do not contribute
+          has_valid      = true;
+          auto const val = static_cast<OutputType>(values.element<InputType>(i));
+          sum_sqr += val * val;
+        }
+
+        if (!has_valid) { return {OutputType{0}, invalid_idx}; }  // all-null group -> null row
+        return {static_cast<OutputType>(group_indices[start] + 1) * sum_sqr -
+                  static_cast<OutputType>(group_max[idx]) * static_cast<OutputType>(group_sum[idx]),
+                idx};
+      }
+    };
+  };
+};
+
+}  // namespace
+
+using doubles_col = cudf::test::fixed_width_column_wrapper<double>;
+using int32s_col  = cudf::test::fixed_width_column_wrapper<int32_t>;
+
+struct HostUDFGroupbyExampleTest : cudf::test::BaseFixture {};
+
+TEST_F(HostUDFGroupbyExampleTest, SimpleInput)
+{
+  double constexpr null = 0.0;  // placeholder stored in the null rows
+  auto const keys       = int32s_col{0, 1, 2, 0, 1, 2, 0, 1, 2, 0};
+  auto const vals       = doubles_col{{0.0, null, 2.0, 3.0, null, 5.0, null, null, 8.0, 9.0},
+                                      {true, false, true, true, false, true, false, false, true, true}};
+  auto agg              = cudf::make_host_udf_aggregation<cudf::groupby_aggregation>(
+    std::make_unique<host_udf_groupby_example>());
+
+  std::vector<cudf::groupby::aggregation_request> requests;
+  requests.emplace_back();
+  requests[0].values = vals;
+  requests[0].aggregations.push_back(std::move(agg));
+  cudf::groupby::groupby gb_obj(
+    cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {});
+
+  auto const grp_result = gb_obj.aggregate(requests, cudf::test::get_default_stream());
+  auto const& result    = grp_result.second[0].results[0];
+
+  // The output type of the example UDF is double.
+  // Values grouped by keys: [ {0, 3, null, 9}, {null, null, null}, {2, 5, 8} ]
+  // Group sum_sqr: [ 90, null, 93 ]
+  // Group max: [ 9, null, 8 ]
+  // Group sum: [ 12, null, 15 ]
+  // Output: [ 1 * 90 - 9 * 12, null, 3 * 93 - 8 * 15 ] = [ -18, null, 159 ]
+  auto const expected = doubles_col{{-18.0, null, 159.0}, {true, false, true}};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(HostUDFGroupbyExampleTest, EmptyInput)
+{
+  // Zero-row keys and values: the aggregation result is compared against the empty values.
+  auto const empty_keys = int32s_col{};
+  auto const empty_vals = doubles_col{};
+
+  std::vector<cudf::groupby::aggregation_request> reqs(1);
+  reqs.front().values = empty_vals;
+  reqs.front().aggregations.push_back(
+    cudf::make_host_udf_aggregation<cudf::groupby_aggregation>(
+      std::make_unique<host_udf_groupby_example>()));
+  cudf::groupby::groupby grouper(
+    cudf::table_view({empty_keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {});
+
+  auto const agg_out = grouper.aggregate(reqs, cudf::test::get_default_stream());
+  auto const& actual = agg_out.second[0].results[0];
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(empty_vals, *actual);
+}
diff --git a/cpp/tests/groupby/host_udf_tests.cpp b/cpp/tests/groupby/host_udf_tests.cpp
new file mode 100644
index 00000000000..1a0f68c0c6c
--- /dev/null
+++ b/cpp/tests/groupby/host_udf_tests.cpp
@@ -0,0 +1,241 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/aggregation/host_udf.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/reduction.hpp>
+
+#include <random>
+#include <vector>
+
+namespace {
+/**
+ * @brief Base class for the host-based UDFs used in unit tests.
+ */
+struct host_udf_test_base : cudf::host_udf_base {
+  int test_location_line;  // source line where the test instance was created
+  bool* test_run;          // set to true once the UDF ran, to detect a skipped test
+  data_attribute_set_t input_attrs;
+
+  host_udf_test_base(int test_location_line_, bool* test_run_, data_attribute_set_t input_attrs_)
+    : test_location_line{test_location_line_},
+      test_run{test_run_},
+      input_attrs(std::move(input_attrs_))
+  {
+  }
+
+  [[nodiscard]] data_attribute_set_t get_required_data() const override { return input_attrs; }
+
+  // This is the main testing function, which checks for the correctness of input data.
+  // The remaining overrides only exist to satisfy the interface.
+  [[nodiscard]] output_t operator()(input_map_t const& input,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr) const override
+  {
+    SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line));
+
+    test_data_attributes(input, stream, mr);
+
+    *test_run = true;  // record that the UDF was actually invoked
+    return get_empty_output(std::nullopt, stream, mr);
+  }
+
+  [[nodiscard]] output_t get_empty_output(
+    [[maybe_unused]] std::optional<cudf::data_type> output_dtype,
+    [[maybe_unused]] rmm::cuda_stream_view stream,
+    [[maybe_unused]] rmm::device_async_resource_ref mr) const override
+  {
+    // Dummy output: an empty INT32 column, which is also what operator() returns.
+    return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
+  }
+
+  [[nodiscard]] std::size_t do_hash() const override { return 0; }
+  [[nodiscard]] bool is_equal(host_udf_base const& other) const override { return true; }
+
+  // The main test function, which must be implemented for each kind of aggregation
+  // (groupby/reduction/segmented_reduction).
+  virtual void test_data_attributes(input_map_t const& input,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr) const = 0;
+};
+
+/**
+ * @brief Host-based UDF that checks the input data passed to it by a groupby aggregation.
+ */
+struct host_udf_groupby_test : host_udf_test_base {
+  host_udf_groupby_test(int test_location_line_,
+                        bool* test_run_,
+                        data_attribute_set_t input_attrs_ = {})
+    : host_udf_test_base(test_location_line_, test_run_, std::move(input_attrs_))
+  {
+  }
+
+  [[nodiscard]] std::unique_ptr<host_udf_base> clone() const override
+  {
+    return std::make_unique<host_udf_groupby_test>(test_location_line, test_run, input_attrs);
+  }
+
+  void test_data_attributes(input_map_t const& input,
+                            rmm::cuda_stream_view stream,
+                            rmm::device_async_resource_ref mr) const override
+  {
+    data_attribute_set_t check_attrs = input_attrs;
+    if (check_attrs.empty()) {  // nothing requested explicitly: expect all six attributes
+      check_attrs = data_attribute_set_t{groupby_data_attribute::INPUT_VALUES,
+                                         groupby_data_attribute::GROUPED_VALUES,
+                                         groupby_data_attribute::SORTED_GROUPED_VALUES,
+                                         groupby_data_attribute::NUM_GROUPS,
+                                         groupby_data_attribute::GROUP_OFFSETS,
+                                         groupby_data_attribute::GROUP_LABELS};
+    }
+    EXPECT_EQ(input.size(), check_attrs.size());  // same number of entries as expected
+    for (auto const& attr : check_attrs) {
+      EXPECT_TRUE(input.count(attr) > 0);
+      EXPECT_TRUE(std::holds_alternative<groupby_data_attribute>(attr.value) ||
+                  std::holds_alternative<std::unique_ptr<cudf::aggregation>>(attr.value));
+      if (std::holds_alternative<groupby_data_attribute>(attr.value)) {
+        switch (std::get<groupby_data_attribute>(attr.value)) {
+          case groupby_data_attribute::INPUT_VALUES:
+            EXPECT_TRUE(std::holds_alternative<cudf::column_view>(input.at(attr)));
+            break;
+          case groupby_data_attribute::GROUPED_VALUES:
+            EXPECT_TRUE(std::holds_alternative<cudf::column_view>(input.at(attr)));
+            break;
+          case groupby_data_attribute::SORTED_GROUPED_VALUES:
+            EXPECT_TRUE(std::holds_alternative<cudf::column_view>(input.at(attr)));
+            break;
+          case groupby_data_attribute::NUM_GROUPS:
+            EXPECT_TRUE(std::holds_alternative<cudf::size_type>(input.at(attr)));
+            break;
+          case groupby_data_attribute::GROUP_OFFSETS:
+            EXPECT_TRUE(
+              std::holds_alternative<cudf::device_span<cudf::size_type const>>(input.at(attr)));
+            break;
+          case groupby_data_attribute::GROUP_LABELS:
+            EXPECT_TRUE(
+              std::holds_alternative<cudf::device_span<cudf::size_type const>>(input.at(attr)));
+            break;
+          default:;  // any other attribute value: nothing to check
+        }
+      } else {  // the attribute is an aggregation: its result must be a column_view
+        EXPECT_TRUE(std::holds_alternative<cudf::column_view>(input.at(attr)));
+      }
+    }
+  }
+};
+
+/**
+ * @brief Get a random, non-empty, proper subset of the input data attributes (needs size > 1).
+ */
+cudf::host_udf_base::data_attribute_set_t get_subset(
+  cudf::host_udf_base::data_attribute_set_t const& attrs)
+{
+  CUDF_EXPECTS(attrs.size() > 1, "Need at least two attributes to pick a proper subset.");
+  std::mt19937 gen(std::random_device{}());
+  std::uniform_int_distribution<std::size_t> size_distr(1, attrs.size() - 1);
+  auto const subset_size = size_distr(gen);
+  auto const elements =
+    std::vector<cudf::host_udf_base::data_attribute>(attrs.begin(), attrs.end());
+  std::uniform_int_distribution<std::size_t> idx_distr(0, attrs.size() - 1);
+  cudf::host_udf_base::data_attribute_set_t output;
+  while (output.size() < subset_size) {
+    output.insert(elements[idx_distr(gen)]);
+  }
+  return output;
+}
+
+/**
+ * @brief Pick one of the min, max, sum, or product aggregations at random.
+ */
+std::unique_ptr<cudf::aggregation> get_random_agg()
+{
+  std::mt19937 rng(std::random_device{}());
+  std::uniform_int_distribution<int> pick(1, 4);
+  auto const choice = pick(rng);
+  if (choice == 1) { return cudf::make_min_aggregation(); }
+  if (choice == 2) { return cudf::make_max_aggregation(); }
+  if (choice == 3) { return cudf::make_sum_aggregation(); }
+  if (choice == 4) { return cudf::make_product_aggregation(); }
+  // The distribution only yields values in [1, 4].
+  CUDF_UNREACHABLE("This should not be reached.");
+  // Explicit fallback return value after the unreachable marker.
+  return nullptr;
+}
+
+}  // namespace
+
+using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
+
+// Number of randomized tests on the input data attributes.
+// For each test, a random subset is drawn from all the possible input data attributes. The input
+// data that libcudf passes for that subset is then tested for correctness.
+// A randomly chosen aggregation is also added to each subset.
+constexpr int NUM_RANDOM_TESTS = 20;
+
+struct HostUDFTest : cudf::test::BaseFixture {};
+
+TEST_F(HostUDFTest, GroupbyAllInput)
+{
+  bool test_run   = false;  // flipped to true by the UDF's operator() when it runs
+  auto const keys = int32s_col{0, 1, 2};
+  auto const vals = int32s_col{0, 1, 2};
+  auto agg        = cudf::make_host_udf_aggregation<cudf::groupby_aggregation>(
+    std::make_unique<host_udf_groupby_test>(__LINE__, &test_run));  // default: empty attributes
+
+  std::vector<cudf::groupby::aggregation_request> requests;
+  requests.emplace_back();
+  requests[0].values = vals;
+  requests[0].aggregations.push_back(std::move(agg));
+  cudf::groupby::groupby gb_obj(
+    cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {});
+  [[maybe_unused]] auto const grp_result =
+    gb_obj.aggregate(requests, cudf::test::get_default_stream());
+  EXPECT_TRUE(test_run);  // fails if the UDF was never invoked
+}
+
+TEST_F(HostUDFTest, GroupbySomeInput)
+{
+  auto const keys      = int32s_col{0, 1, 2};
+  auto const vals      = int32s_col{0, 1, 2};
+  auto const all_attrs = cudf::host_udf_base::data_attribute_set_t{
+    cudf::host_udf_base::groupby_data_attribute::INPUT_VALUES,
+    cudf::host_udf_base::groupby_data_attribute::GROUPED_VALUES,
+    cudf::host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES,
+    cudf::host_udf_base::groupby_data_attribute::NUM_GROUPS,
+    cudf::host_udf_base::groupby_data_attribute::GROUP_OFFSETS,
+    cudf::host_udf_base::groupby_data_attribute::GROUP_LABELS};
+  for (int i = 0; i < NUM_RANDOM_TESTS; ++i) {
+    bool test_run    = false;  // flipped to true by the UDF's operator() when it runs
+    auto input_attrs = get_subset(all_attrs);
+    input_attrs.insert(get_random_agg());  // also request the result of one aggregation
+    auto agg = cudf::make_host_udf_aggregation<cudf::groupby_aggregation>(
+      std::make_unique<host_udf_groupby_test>(__LINE__, &test_run, std::move(input_attrs)));
+
+    std::vector<cudf::groupby::aggregation_request> requests;
+    requests.emplace_back();
+    requests[0].values = vals;
+    requests[0].aggregations.push_back(std::move(agg));
+    cudf::groupby::groupby gb_obj(
+      cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {});
+    [[maybe_unused]] auto const grp_result =
+      gb_obj.aggregate(requests, cudf::test::get_default_stream());
+    EXPECT_TRUE(test_run);  // fails if the UDF was never invoked
+  }
+}
diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu
index 4ae5d06b214..883a5093bd1 100644
--- a/cpp/tests/groupby/tdigest_tests.cu
+++ b/cpp/tests/groupby/tdigest_tests.cu
@@ -30,6 +30,7 @@
 #include <thrust/fill.h>
 #include <thrust/iterator/counting_iterator.h>
 
+namespace {
 /**
  * @brief Functor to generate a tdigest by key.
  *
@@ -116,6 +117,7 @@ struct tdigest_groupby_simple_merge_op {
     return std::move(result.second[0].results[0]);
   }
 };
+}  // namespace
 
 template <typename T>
 struct TDigestAllTypes : public cudf::test::BaseFixture {};
@@ -508,6 +510,7 @@ TEST_F(TDigestMergeTest, EmptyGroups)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]);
 }
 
+namespace {
 std::unique_ptr<cudf::table> do_agg(
   cudf::column_view key,
   cudf::column_view val,
@@ -537,6 +540,7 @@ std::unique_ptr<cudf::table> do_agg(
 
   return std::make_unique<cudf::table>(std::move(result_columns));
 }
+}  // namespace
 
 TEST_F(TDigestMergeTest, AllValuesAreNull)
 {
diff --git a/cpp/tests/hashing/xxhash_32_test.cpp b/cpp/tests/hashing/xxhash_32_test.cpp
new file mode 100644
index 00000000000..9e3c66b0d0b
--- /dev/null
+++ b/cpp/tests/hashing/xxhash_32_test.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/hashing.hpp>
+
+class XXHash_32_Test : public cudf::test::BaseFixture {};
+
+TEST_F(XXHash_32_Test, TestInteger)
+{
+  // Hash three int32 values with seed 0.
+  auto input = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 42, 825}};
+
+  // Reference values were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto reference =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({148298089u, 1161967057u, 1066694813u});
+  auto const hashed = cudf::hashing::xxhash_32(cudf::table_view({input}), 0u);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed->view(), reference);
+}
+
+TEST_F(XXHash_32_Test, TestDouble)
+{
+  // Hash three double values with seed 42.
+  auto constexpr hash_seed = 42u;
+  auto input               = cudf::test::fixed_width_column_wrapper<double>{{-8., 25., 90.}};
+
+  auto const hashed = cudf::hashing::xxhash_32(cudf::table_view({input}), hash_seed);
+
+  // Reference values were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto reference =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({2276435783u, 3120212431u, 3454197470u});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed->view(), reference);
+}
+
+TEST_F(XXHash_32_Test, StringType)
+{
+  // Hash three short strings with seed 825.
+  auto constexpr hash_seed = 825u;
+  auto input               = cudf::test::strings_column_wrapper({"I", "am", "AI"});
+
+  auto hashed = cudf::hashing::xxhash_32(cudf::table_view({input}), hash_seed);
+
+  // Reference values were generated with the reference implementation:
+  // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h
+  auto reference =
+    cudf::test::fixed_width_column_wrapper<uint32_t>({320624298u, 1612654309u, 1409499009u});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(hashed->view(), reference);
+}
diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp
index ef4b9dd9b8a..b7106e823dd 100644
--- a/cpp/tests/interop/dlpack_test.cpp
+++ b/cpp/tests/interop/dlpack_test.cpp
@@ -26,6 +26,7 @@
 
 #include <dlpack/dlpack.h>
 
+namespace {
 struct dlpack_deleter {
   void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); }
 };
@@ -60,6 +61,7 @@ void validate_dtype(DLDataType const& dtype)
   EXPECT_EQ(1, dtype.lanes);
   EXPECT_EQ(sizeof(T) * 8, dtype.bits);
 }
+}  // namespace
 
 class DLPackUntypedTests : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp
index 54262dc3b44..5bbe8b63c47 100644
--- a/cpp/tests/io/comp/decomp_test.cpp
+++ b/cpp/tests/io/comp/decomp_test.cpp
@@ -30,6 +30,9 @@
 #include <vector>
 
 using cudf::device_span;
+using cudf::io::detail::compression_result;
+using cudf::io::detail::compression_status;
+namespace nvcomp = cudf::io::detail::nvcomp;
 
 /**
  * @brief Base test fixture for decompression
@@ -61,7 +64,7 @@ struct DecompressTest : public cudf::test::BaseFixture {
     inf_out[0] = dst;
     inf_out.host_to_device_async(stream);
 
-    cudf::detail::hostdevice_vector<cudf::io::compression_result> inf_stat(1, stream);
+    cudf::detail::hostdevice_vector<compression_result> inf_stat(1, stream);
     inf_stat[0] = {};
     inf_stat.host_to_device_async(stream);
 
@@ -69,7 +72,7 @@ struct DecompressTest : public cudf::test::BaseFixture {
     CUDF_CUDA_TRY(cudaMemcpyAsync(
       decompressed.data(), dst.data(), dst.size(), cudaMemcpyDefault, stream.value()));
     inf_stat.device_to_host_sync(stream);
-    ASSERT_EQ(inf_stat[0].status, cudf::io::compression_status::SUCCESS);
+    ASSERT_EQ(inf_stat[0].status, compression_status::SUCCESS);
   }
 };
 
@@ -79,13 +82,13 @@ struct DecompressTest : public cudf::test::BaseFixture {
 struct GzipDecompressTest : public DecompressTest<GzipDecompressTest> {
   void dispatch(device_span<device_span<uint8_t const>> d_inf_in,
                 device_span<device_span<uint8_t>> d_inf_out,
-                device_span<cudf::io::compression_result> d_inf_stat)
+                device_span<compression_result> d_inf_stat)
   {
-    cudf::io::gpuinflate(d_inf_in,
-                         d_inf_out,
-                         d_inf_stat,
-                         cudf::io::gzip_header_included::YES,
-                         cudf::get_default_stream());
+    cudf::io::detail::gpuinflate(d_inf_in,
+                                 d_inf_out,
+                                 d_inf_stat,
+                                 cudf::io::detail::gzip_header_included::YES,
+                                 cudf::get_default_stream());
   }
 };
 
@@ -95,9 +98,9 @@ struct GzipDecompressTest : public DecompressTest<GzipDecompressTest> {
 struct SnappyDecompressTest : public DecompressTest<SnappyDecompressTest> {
   void dispatch(device_span<device_span<uint8_t const>> d_inf_in,
                 device_span<device_span<uint8_t>> d_inf_out,
-                device_span<cudf::io::compression_result> d_inf_stat)
+                device_span<compression_result> d_inf_stat)
   {
-    cudf::io::gpu_unsnap(d_inf_in, d_inf_out, d_inf_stat, cudf::get_default_stream());
+    cudf::io::detail::gpu_unsnap(d_inf_in, d_inf_out, d_inf_stat, cudf::get_default_stream());
   }
 };
 
@@ -107,17 +110,17 @@ struct SnappyDecompressTest : public DecompressTest<SnappyDecompressTest> {
 struct BrotliDecompressTest : public DecompressTest<BrotliDecompressTest> {
   void dispatch(device_span<device_span<uint8_t const>> d_inf_in,
                 device_span<device_span<uint8_t>> d_inf_out,
-                device_span<cudf::io::compression_result> d_inf_stat)
+                device_span<compression_result> d_inf_stat)
   {
-    rmm::device_buffer d_scratch{cudf::io::get_gpu_debrotli_scratch_size(1),
+    rmm::device_buffer d_scratch{cudf::io::detail::get_gpu_debrotli_scratch_size(1),
                                  cudf::get_default_stream()};
 
-    cudf::io::gpu_debrotli(d_inf_in,
-                           d_inf_out,
-                           d_inf_stat,
-                           d_scratch.data(),
-                           d_scratch.size(),
-                           cudf::get_default_stream());
+    cudf::io::detail::gpu_debrotli(d_inf_in,
+                                   d_inf_out,
+                                   d_inf_stat,
+                                   d_scratch.data(),
+                                   d_scratch.size(),
+                                   cudf::get_default_stream());
   }
 };
 
@@ -181,8 +184,8 @@ TEST_F(BrotliDecompressTest, HelloWorld)
 
 TEST_F(NvcompConfigTest, Compression)
 {
-  using cudf::io::nvcomp::compression_type;
-  auto const& comp_disabled = cudf::io::nvcomp::is_compression_disabled;
+  using nvcomp::compression_type;
+  auto const& comp_disabled = nvcomp::is_compression_disabled;
 
   EXPECT_FALSE(comp_disabled(compression_type::DEFLATE, {true, true}));
   // all integrations enabled required
@@ -201,8 +204,8 @@ TEST_F(NvcompConfigTest, Compression)
 
 TEST_F(NvcompConfigTest, Decompression)
 {
-  using cudf::io::nvcomp::compression_type;
-  auto const& decomp_disabled = cudf::io::nvcomp::is_decompression_disabled;
+  using nvcomp::compression_type;
+  auto const& decomp_disabled = nvcomp::is_decompression_disabled;
 
   EXPECT_FALSE(decomp_disabled(compression_type::DEFLATE, {true, true}));
   // all integrations enabled required
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 3c8db99c3c7..23ca5734ded 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -56,6 +56,8 @@ using int16_wrapper        = wrapper<int16_t>;
 using int64_wrapper        = wrapper<int64_t>;
 using timestamp_ms_wrapper = wrapper<cudf::timestamp_ms, cudf::timestamp_ms::rep>;
 using bool_wrapper         = wrapper<bool>;
+using size_type_wrapper    = wrapper<cudf::size_type>;
+using strings_wrapper      = cudf::test::strings_column_wrapper;
 
 using cudf::data_type;
 using cudf::type_id;
@@ -3253,6 +3255,144 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilterWithOrder)
       CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), *wrapped);
     }
   }
+
+  // test list (all-null) of struct (empty) of string (empty)
+  {
+    std::string json_stringl = R"(
+    {"a" : [1], "c2": [1, 2]}
+    {}
+    )";
+    auto lines               = true;
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(
+        cudf::io::source_info{json_stringl.data(), json_stringl.size()})
+        .prune_columns(true)
+        .experimental(true)
+        .lines(lines);
+
+    cudf::io::schema_element dtype_schema{
+      data_type{cudf::type_id::STRUCT},
+      {
+        {"a", {data_type{cudf::type_id::LIST}, {{"element", {dtype<int64_t>()}}}}},
+        {"c2",
+         {data_type{cudf::type_id::LIST},
+          {{"element",
+            {data_type{cudf::type_id::STRUCT},
+             {
+               {"d", {data_type{cudf::type_id::STRING}}},
+             },
+             {{"d"}}}}}}},
+      },
+      {{"a", "c2"}}};
+    in_options.set_dtypes(dtype_schema);
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+    // Make sure we have column "a":[int64_t]
+    ASSERT_EQ(result.tbl->num_columns(), 2);
+    ASSERT_EQ(result.metadata.schema_info.size(), 2);
+    EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+    ASSERT_EQ(result.metadata.schema_info[0].children.size(), 2);
+    EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "offsets");
+    EXPECT_EQ(result.metadata.schema_info[0].children[1].name, "element");
+    // Make sure we have an all-null list column "c2": [{"d": ""}]
+    EXPECT_EQ(result.metadata.schema_info[1].name, "c2");
+    ASSERT_EQ(result.metadata.schema_info[1].children.size(), 2);
+    EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "offsets");
+    EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "element");
+    ASSERT_EQ(result.metadata.schema_info[1].children[1].children.size(), 1);
+    EXPECT_EQ(result.metadata.schema_info[1].children[1].children[0].name, "d");
+
+    auto const expected0 = [&] {
+      auto const valids = std::vector<bool>{1, 0};
+      auto [null_mask, null_count] =
+        cudf::test::detail::make_null_mask(valids.begin(), valids.end());
+      return cudf::make_lists_column(2,
+                                     size_type_wrapper{0, 1, 1}.release(),
+                                     int64_wrapper{1}.release(),
+                                     null_count,
+                                     std::move(null_mask));
+    }();
+
+    auto const expected1 = [&] {
+      auto const get_structs = [] {
+        auto child = cudf::test::strings_column_wrapper{};
+        return cudf::test::structs_column_wrapper{{child}};
+      };
+      auto const valids = std::vector<bool>{0, 0};
+      auto [null_mask, null_count] =
+        cudf::test::detail::make_null_mask(valids.begin(), valids.end());
+      return cudf::make_lists_column(2,
+                                     size_type_wrapper{0, 0, 0}.release(),
+                                     get_structs().release(),
+                                     null_count,
+                                     std::move(null_mask));
+    }();
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected0, result.tbl->get_column(0).view());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected1, result.tbl->get_column(1).view());
+  }
+}
+
+TEST_F(JsonReaderTest, NullifyMixedList)
+{
+  using namespace cudf::test::iterators;
+  // test list of struct: rows whose list holds a non-struct element become null
+  std::string json_stringl = R"(
+      {"c2": []}
+      {"c2": [{}]}
+      {"c2": [[]]}
+      {"c2": [{}, [], {}]}
+      {"c2": [[123], {"b": "1"}]}
+      {"c2": [{"x": "y"}, {"b": "1"}]}
+      {}
+    )";
+  // [], [{null, null}], null, null, null, [{null, null}, {1, null}], null
+  // valid     1  1  0  0  0  1  0
+  // offset 0, 0, 1, 1, 1, 1, 3, 3
+  // child  {null, null}, {null, null}, {1, null}
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{json_stringl.data(), json_stringl.size()})
+      .prune_columns(true)
+      .experimental(true)
+      .lines(true);
+
+  // struct<c2: array<struct<b: string, c: string>>> e.g. {"c2": [{"b": "1", "c": "2"}]}
+  cudf::io::schema_element dtype_schema{data_type{cudf::type_id::STRUCT},
+                                        {
+                                          {"c2",
+                                           {data_type{cudf::type_id::LIST},
+                                            {{"element",
+                                              {data_type{cudf::type_id::STRUCT},
+                                               {
+                                                 {"b", {data_type{cudf::type_id::STRING}}},
+                                                 {"c", {data_type{cudf::type_id::STRING}}},
+                                               },
+                                               {{"b", "c"}}}}}}},
+                                        },
+                                        {{"c2"}}};
+  in_options.set_dtypes(dtype_schema);
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+  ASSERT_EQ(result.tbl->num_columns(), 1);
+  ASSERT_EQ(result.metadata.schema_info.size(), 1);
+
+  // Expected: A list of struct of 2-string columns
+  // [], [{null, null}], null, null, null, [{null, null}, {1, null}], null
+  auto get_structs = [] {
+    strings_wrapper child0{{"", "", "1"}, nulls_at({0, 0, 1})};
+    strings_wrapper child1{{"", "", ""}, all_nulls()};
+    // purging non-empty nulls in a list seems to retain the null mask of its struct child column
+    return cudf::test::structs_column_wrapper{{child0, child1}, no_nulls()}.release();
+  };
+  std::vector<bool> const list_nulls{1, 1, 0, 0, 0, 1, 0};
+  auto [null_mask, null_count] =
+    cudf::test::detail::make_null_mask(list_nulls.cbegin(), list_nulls.cend());
+  auto const expected = cudf::make_lists_column(
+    7,
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 1, 1, 1, 1, 3, 3}.release(),
+    get_structs(),
+    null_count,
+    std::move(null_mask));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, result.tbl->get_column(0).view());
 }
 
 struct JsonCompressedIOTest : public cudf::test::BaseFixture,
@@ -3310,4 +3450,15 @@ TEST_P(JsonCompressedIOTest, BasicJsonLines)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3}});
 }
 
+TEST_F(JsonReaderTest, MismatchedBeginEndTokens)
+{
+  std::string data = R"({"not_valid": "json)";
+  auto opts =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .lines(true)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL)
+      .build();
+  EXPECT_THROW(cudf::io::read_json(opts), cudf::logic_error);
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp
index 887d4fa783f..5201a46ba7d 100644
--- a/cpp/tests/io/json/json_tree.cpp
+++ b/cpp/tests/io/json/json_tree.cpp
@@ -34,6 +34,8 @@
 
 namespace cuio_json = cudf::io::json;
 
+namespace {
+
 // Host copy of tree_meta_t
 struct tree_meta_t2 {
   std::vector<cuio_json::NodeT> node_categories;
@@ -43,8 +45,6 @@ struct tree_meta_t2 {
   std::vector<cuio_json::SymbolOffsetT> node_range_end;
 };
 
-namespace {
-
 tree_meta_t2 to_cpu_tree(cuio_json::tree_meta_t const& d_value, rmm::cuda_stream_view stream)
 {
   return {cudf::detail::make_std_vector_async(d_value.node_categories, stream),
diff --git a/cpp/tests/io/json/json_tree_csr.cu b/cpp/tests/io/json/json_tree_csr.cu
index f988ae24b38..a67830a7864 100644
--- a/cpp/tests/io/json/json_tree_csr.cu
+++ b/cpp/tests/io/json/json_tree_csr.cu
@@ -36,6 +36,8 @@
 
 namespace cuio_json = cudf::io::json;
 
+namespace {
+
 struct h_tree_meta_t {
   std::vector<cuio_json::NodeT> node_categories;
   std::vector<cuio_json::NodeIndexT> parent_node_ids;
@@ -222,6 +224,7 @@ void run_test(std::string const& input, bool enable_lines = true)
   // assert equality between csr and meta formats
   ASSERT_TRUE(iseq);
 }
+}  // namespace
 
 struct JsonColumnTreeTests : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index fce99187516..2209a30149d 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -40,6 +40,8 @@
 #include <array>
 #include <type_traits>
 
+namespace nvcomp = cudf::io::detail::nvcomp;
+
 template <typename T, typename SourceElementT = T>
 using column_wrapper =
   std::conditional_t<std::is_same_v<T, cudf::string_view>,
@@ -1135,7 +1137,7 @@ TEST_F(OrcReaderTest, SingleInputs)
 
 TEST_F(OrcReaderTest, zstdCompressionRegression)
 {
-  if (cudf::io::nvcomp::is_decompression_disabled(cudf::io::nvcomp::compression_type::ZSTD)) {
+  if (nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD)) {
     GTEST_SKIP() << "Newer nvCOMP version is required";
   }
 
@@ -1700,8 +1702,8 @@ TEST_F(OrcMetadataReaderTest, TestNested)
 
 TEST_F(OrcReaderTest, ZstdMaxCompressionRate)
 {
-  if (cudf::io::nvcomp::is_decompression_disabled(cudf::io::nvcomp::compression_type::ZSTD) or
-      cudf::io::nvcomp::is_compression_disabled(cudf::io::nvcomp::compression_type::ZSTD)) {
+  if (nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD) or
+      nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD)) {
     GTEST_SKIP() << "Newer nvCOMP version is required";
   }
 
diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu
index 153a8a0c5aa..369376b6c95 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cu
+++ b/cpp/tests/io/parquet_chunked_reader_test.cu
@@ -1074,6 +1074,7 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadNullCount)
   } while (reader.has_next());
 }
 
+namespace {
 constexpr size_t input_limit_expected_file_count = 4;
 
 std::vector<std::string> input_limit_get_test_names(std::string const& base_filename)
@@ -1133,6 +1134,7 @@ void input_limit_test_read(std::vector<std::string> const& test_filenames,
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result.first, t);
   }
 }
+}  // namespace
 
 struct ParquetChunkedReaderInputLimitConstrainedTest : public cudf::test::BaseFixture {};
 
@@ -1189,6 +1191,7 @@ TEST_F(ParquetChunkedReaderInputLimitConstrainedTest, MixedColumns)
 
 struct ParquetChunkedReaderInputLimitTest : public cudf::test::BaseFixture {};
 
+namespace {
 struct offset_gen {
   int const group_size;
   __device__ int operator()(int i) { return i * group_size; }
@@ -1198,6 +1201,8 @@ template <typename T>
 struct value_gen {
   __device__ T operator()(int i) { return i % 1024; }
 };
+}  // namespace
+
 TEST_F(ParquetChunkedReaderInputLimitTest, List)
 {
   auto base_path      = temp_env->get_temp_filepath("list");
@@ -1263,6 +1268,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, List)
   input_limit_test_read(test_filenames, tbl, 32 * 1024 * 1024, 64 * 1024 * 1024, expected_c);
 }
 
+namespace {
 void tiny_list_rowgroup_test(bool just_list_col)
 {
   auto iter = thrust::make_counting_iterator(0);
@@ -1320,6 +1326,7 @@ void tiny_list_rowgroup_test(bool just_list_col)
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *(result.first));
 }
+}  // namespace
 
 TEST_F(ParquetChunkedReaderInputLimitTest, TinyListRowGroupsSingle)
 {
@@ -1333,6 +1340,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, TinyListRowGroupsMixed)
   tiny_list_rowgroup_test(false);
 }
 
+namespace {
 struct char_values {
   __device__ int8_t operator()(int i)
   {
@@ -1341,6 +1349,8 @@ struct char_values {
     return index == 0 ? 'a' : (index == 1 ? 'b' : 'c');
   }
 };
+}  // namespace
+
 TEST_F(ParquetChunkedReaderInputLimitTest, Mixed)
 {
   auto base_path      = temp_env->get_temp_filepath("mixed_types");
diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu
index 257c0979017..8377060b6ec 100644
--- a/cpp/tests/iterator/optional_iterator_test_numeric.cu
+++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu
@@ -26,16 +26,6 @@
 
 using TestingTypes = cudf::test::NumericTypes;
 
-namespace cudf {
-// To print meanvar for debug.
-// Needs to be in the cudf namespace for ADL
-template <typename T>
-std::ostream& operator<<(std::ostream& os, cudf::meanvar<T> const& rhs)
-{
-  return os << "[" << rhs.value << ", " << rhs.value_squared << ", " << rhs.count << "] ";
-};
-}  // namespace cudf
-
 template <typename T>
 struct NumericOptionalIteratorTest : public IteratorTest<T> {};
 
@@ -46,6 +36,7 @@ TYPED_TEST(NumericOptionalIteratorTest, nonull_optional_iterator)
 }
 TYPED_TEST(NumericOptionalIteratorTest, null_optional_iterator) { null_optional_iterator(*this); }
 
+namespace {
 // Transformers and Operators for optional_iterator test
 template <typename ElementType>
 struct transformer_optional_meanvar {
@@ -65,6 +56,7 @@ template <typename T>
 struct optional_to_meanvar {
   CUDF_HOST_DEVICE inline T operator()(cuda::std::optional<T> const& v) { return v.value_or(T{0}); }
 };
+}  // namespace
 
 // TODO: enable this test also at __CUDACC_DEBUG__
 // This test causes fatal compilation error only at device debug mode.
diff --git a/cpp/tests/iterator/pair_iterator_test_numeric.cu b/cpp/tests/iterator/pair_iterator_test_numeric.cu
index 3447aa0dde6..5f707232953 100644
--- a/cpp/tests/iterator/pair_iterator_test_numeric.cu
+++ b/cpp/tests/iterator/pair_iterator_test_numeric.cu
@@ -24,16 +24,6 @@
 
 using TestingTypes = cudf::test::NumericTypes;
 
-namespace cudf {
-// To print meanvar for debug.
-// Needs to be in the cudf namespace for ADL
-template <typename T>
-std::ostream& operator<<(std::ostream& os, cudf::meanvar<T> const& rhs)
-{
-  return os << "[" << rhs.value << ", " << rhs.value_squared << ", " << rhs.count << "] ";
-};
-}  // namespace cudf
-
 template <typename T>
 struct NumericPairIteratorTest : public IteratorTest<T> {};
 
@@ -53,6 +43,7 @@ struct transformer_pair_meanvar {
   };
 };
 
+namespace {
 struct sum_if_not_null {
   template <typename T>
   CUDF_HOST_DEVICE inline thrust::pair<T, bool> operator()(thrust::pair<T, bool> const& lhs,
@@ -66,6 +57,7 @@ struct sum_if_not_null {
       return {rhs};
   }
 };
+}  // namespace
 
 // TODO: enable this test also at __CUDACC_DEBUG__
 // This test causes fatal compilation error only at device debug mode.
diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp
index 9070efa38fe..e1ec8cda3ac 100644
--- a/cpp/tests/join/distinct_join_tests.cpp
+++ b/cpp/tests/join/distinct_join_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -53,7 +53,7 @@ struct DistinctJoinTest : public cudf::test::BaseFixture {
     cudf::table_view const& expected_table,
     cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK)
   {
-    auto const& [build_join_indices, probe_join_indices] = result;
+    auto const& [probe_join_indices, build_join_indices] = result;
 
     auto build_indices_span = cudf::device_span<cudf::size_type const>{*build_join_indices};
     auto probe_indices_span = cudf::device_span<cudf::size_type const>{*probe_join_indices};
@@ -89,10 +89,9 @@ TEST_F(DistinctJoinTest, IntegerInnerJoin)
   auto build_table = cudf::table_view{{build->view()}};
   auto probe_table = cudf::table_view{{probe->view()}};
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::NO>{
-    build_table, probe_table, cudf::nullable_join::NO};
+  auto distinct_join = cudf::distinct_hash_join{build_table};
 
-  auto result = distinct_join.inner_join();
+  auto result = distinct_join.inner_join(probe_table);
 
   auto constexpr gold_size = size / 2;
   auto gold                = cudf::sequence(gold_size, init, cudf::numeric_scalar<int32_t>{2});
@@ -120,8 +119,8 @@ TEST_F(DistinctJoinTest, InnerJoinNoNulls)
   Table build(std::move(cols0));
   Table probe(std::move(cols1));
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::YES>{build.view(), probe.view()};
-  auto result        = distinct_join.inner_join();
+  auto distinct_join = cudf::distinct_hash_join{build.view()};
+  auto result        = distinct_join.inner_join(probe.view());
 
   column_wrapper<int32_t> col_gold_0{{1, 2}};
   strcol_wrapper col_gold_1({"s0", "s0"});
@@ -162,8 +161,8 @@ TEST_F(DistinctJoinTest, InnerJoinWithNulls)
   Table build(std::move(cols0));
   Table probe(std::move(cols1));
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::YES>{build.view(), probe.view()};
-  auto result        = distinct_join.inner_join();
+  auto distinct_join = cudf::distinct_hash_join{build.view()};
+  auto result        = distinct_join.inner_join(probe.view());
 
   column_wrapper<int32_t> col_gold_0{{3, 2}};
   strcol_wrapper col_gold_1({"s1", "s0"}, {true, true});
@@ -229,8 +228,8 @@ TEST_F(DistinctJoinTest, InnerJoinWithStructsAndNulls)
   Table probe(std::move(cols0));
   Table build(std::move(cols1));
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::YES>{build.view(), probe.view()};
-  auto result        = distinct_join.inner_join();
+  auto distinct_join = cudf::distinct_hash_join{build.view()};
+  auto result        = distinct_join.inner_join(probe.view());
 
   column_wrapper<int32_t> col_gold_0{{3, 2}};
   strcol_wrapper col_gold_1({"s1", "s0"}, {true, true});
@@ -284,8 +283,8 @@ TEST_F(DistinctJoinTest, EmptyBuildTableInnerJoin)
   Table build(std::move(cols0));
   Table probe(std::move(cols1));
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::NO>{build.view(), probe.view()};
-  auto result        = distinct_join.inner_join();
+  auto distinct_join = cudf::distinct_hash_join{build.view()};
+  auto result        = distinct_join.inner_join(probe.view());
 
   this->compare_to_reference(build.view(), probe.view(), result, build.view());
 }
@@ -307,9 +306,9 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin)
   Table build(std::move(cols0));
   Table probe(std::move(cols1));
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::NO>{build.view(), probe.view()};
-  auto result        = distinct_join.left_join();
-  auto gather_map    = std::pair{std::move(result), get_left_indices(result->size())};
+  auto distinct_join = cudf::distinct_hash_join{build.view()};
+  auto result        = distinct_join.left_join(probe.view());
+  auto gather_map    = std::pair{get_left_indices(result->size()), std::move(result)};
 
   this->compare_to_reference(
     build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY);
@@ -332,8 +331,8 @@ TEST_F(DistinctJoinTest, EmptyProbeTableInnerJoin)
   Table build(std::move(cols0));
   Table probe(std::move(cols1));
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::NO>{build.view(), probe.view()};
-  auto result        = distinct_join.inner_join();
+  auto distinct_join = cudf::distinct_hash_join{build.view()};
+  auto result        = distinct_join.inner_join(probe.view());
 
   this->compare_to_reference(build.view(), probe.view(), result, probe.view());
 }
@@ -355,9 +354,9 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin)
   Table build(std::move(cols0));
   Table probe(std::move(cols1));
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::NO>{build.view(), probe.view()};
-  auto result        = distinct_join.left_join();
-  auto gather_map    = std::pair{std::move(result), get_left_indices(result->size())};
+  auto distinct_join = cudf::distinct_hash_join{build.view()};
+  auto result        = distinct_join.left_join(probe.view());
+  auto gather_map    = std::pair{get_left_indices(result->size()), std::move(result)};
 
   this->compare_to_reference(
     build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY);
@@ -391,9 +390,9 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls)
   cols_gold.push_back(col_gold_3.release());
   Table gold(std::move(cols_gold));
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::NO>{build.view(), probe.view()};
-  auto result        = distinct_join.left_join();
-  auto gather_map    = std::pair{std::move(result), get_left_indices(result->size())};
+  auto distinct_join = cudf::distinct_hash_join{build.view()};
+  auto result        = distinct_join.left_join(probe.view());
+  auto gather_map    = std::pair{get_left_indices(result->size()), std::move(result)};
 
   this->compare_to_reference(
     build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY);
@@ -416,9 +415,9 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls)
   Table probe(std::move(cols0));
   Table build(std::move(cols1));
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::NO>{build.view(), probe.view()};
-  auto result        = distinct_join.left_join();
-  auto gather_map    = std::pair{std::move(result), get_left_indices(result->size())};
+  auto distinct_join = cudf::distinct_hash_join{build.view()};
+  auto result        = distinct_join.left_join(probe.view());
+  auto gather_map    = std::pair{get_left_indices(result->size()), std::move(result)};
 
   column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}};
   strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true});
@@ -461,9 +460,9 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls)
   Table probe(std::move(cols0));
   Table build(std::move(cols1));
 
-  auto distinct_join = cudf::distinct_hash_join<cudf::has_nested::YES>{build.view(), probe.view()};
-  auto result        = distinct_join.left_join();
-  auto gather_map    = std::pair{std::move(result), get_left_indices(result->size())};
+  auto distinct_join = cudf::distinct_hash_join{build.view()};
+  auto result        = distinct_join.left_join(probe.view());
+  auto gather_map    = std::pair{get_left_indices(result->size()), std::move(result)};
 
   auto col0_gold_names_col = strcol_wrapper{
     "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"};
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp
index 37414eb3fba..c146fd2ea4e 100644
--- a/cpp/tests/quantiles/percentile_approx_test.cpp
+++ b/cpp/tests/quantiles/percentile_approx_test.cpp
@@ -33,6 +33,7 @@
 
 #include <arrow/util/tdigest.h>
 
+namespace {
 std::unique_ptr<cudf::column> arrow_percentile_approx(cudf::column_view const& _values,
                                                       int delta,
                                                       std::vector<double> const& percentages)
@@ -315,6 +316,7 @@ cudf::data_type get_appropriate_type()
   if constexpr (cudf::is_fixed_point<T>()) { return cudf::data_type{cudf::type_to_id<T>(), -7}; }
   return cudf::data_type{cudf::type_to_id<T>()};
 }
+}  // namespace
 
 using PercentileApproxTypes =
   cudf::test::Concat<cudf::test::NumericTypes, cudf::test::FixedPointTypes>;
diff --git a/cpp/tests/reductions/tdigest_tests.cu b/cpp/tests/reductions/tdigest_tests.cu
index c8fec51e1c9..184725e17e0 100644
--- a/cpp/tests/reductions/tdigest_tests.cu
+++ b/cpp/tests/reductions/tdigest_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@ template <typename T>
 struct ReductionTDigestAllTypes : public cudf::test::BaseFixture {};
 TYPED_TEST_SUITE(ReductionTDigestAllTypes, cudf::test::NumericTypes);
 
+namespace {
 struct reduce_op {
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& values, int delta) const
   {
@@ -60,6 +61,7 @@ struct reduce_merge_op {
     return cudf::make_structs_column(tbl.num_rows(), std::move(cols), 0, rmm::device_buffer());
   }
 };
+}  // namespace
 
 TYPED_TEST(ReductionTDigestAllTypes, Simple)
 {
diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp
index 7133baf6df1..79ea6b7d6d4 100644
--- a/cpp/tests/streams/interop_test.cpp
+++ b/cpp/tests/streams/interop_test.cpp
@@ -23,9 +23,11 @@
 
 #include <dlpack/dlpack.h>
 
+namespace {
 struct dlpack_deleter {
   void operator()(DLManagedTensor* tensor) { tensor->deleter(tensor); }
 };
+}  // namespace
 
 struct DLPackTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/streams/replace_test.cpp b/cpp/tests/streams/replace_test.cpp
index 89f76237de6..e3fdc177b50 100644
--- a/cpp/tests/streams/replace_test.cpp
+++ b/cpp/tests/streams/replace_test.cpp
@@ -104,9 +104,9 @@ TEST_F(ReplaceTest, NormalizeNansAndZeros)
 
 TEST_F(ReplaceTest, NormalizeNansAndZerosMutable)
 {
-  auto nan          = std::numeric_limits<double>::quiet_NaN();
-  auto input_column = cudf::test::make_type_param_vector<double>({-0.0, 0.0, -nan, nan, nan});
-  cudf::test::fixed_width_column_wrapper<double> input(input_column.begin(), input_column.end());
-  cudf::mutable_column_view mutable_view = cudf::column(input, cudf::test::get_default_stream());
-  cudf::normalize_nans_and_zeros(mutable_view, cudf::test::get_default_stream());
+  auto nan   = std::numeric_limits<double>::quiet_NaN();
+  auto data  = cudf::test::make_type_param_vector<double>({-0.0, 0.0, -nan, nan, nan});
+  auto input = cudf::test::fixed_width_column_wrapper<double>(data.begin(), data.end()).release();
+  auto view  = input->mutable_view();
+  cudf::normalize_nans_and_zeros(view, cudf::test::get_default_stream());
 }
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index 042ac44621e..8bfb17e0efd 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -44,10 +44,9 @@ TEST_F(MinHashTest, Permuted)
 
   auto view = cudf::strings_column_view(input);
 
-  auto first  = thrust::counting_iterator<uint32_t>(10);
-  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
-  auto results =
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
+  auto first   = thrust::counting_iterator<uint32_t>(10);
+  auto params  = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
+  auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
   using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
   // clang-format off
@@ -66,9 +65,9 @@ TEST_F(MinHashTest, Permuted)
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
-  auto results64 = nvtext::minhash64_permuted(
-    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
+  auto results64 =
+    nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
@@ -95,10 +94,9 @@ TEST_F(MinHashTest, PermutedWide)
   auto input = cudf::test::strings_column_wrapper({small, wide});
   auto view  = cudf::strings_column_view(input);
 
-  auto first  = thrust::counting_iterator<uint32_t>(20);
-  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
-  auto results =
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
+  auto first   = thrust::counting_iterator<uint32_t>(20);
+  auto params  = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
+  auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
   using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
   // clang-format off
@@ -109,9 +107,9 @@ TEST_F(MinHashTest, PermutedWide)
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
-  auto results64 = nvtext::minhash64_permuted(
-    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
+  auto results64 =
+    nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
@@ -132,9 +130,8 @@ TEST_F(MinHashTest, PermutedManyParameters)
 
   auto first = thrust::counting_iterator<uint32_t>(20);
   // more than params_per_thread
-  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 31);
-  auto results =
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
+  auto params  = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 31);
+  auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
   using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
   // clang-format off
@@ -152,9 +149,9 @@ TEST_F(MinHashTest, PermutedManyParameters)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
   // more than params_per_thread
-  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 31);
-  auto results64 = nvtext::minhash64_permuted(
-    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 31);
+  auto results64 =
+    nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
@@ -182,15 +179,13 @@ TEST_F(MinHashTest, PermutedManyParameters)
 
 TEST_F(MinHashTest, EmptyTest)
 {
-  auto input  = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-  auto view   = cudf::strings_column_view(input->view());
-  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2, 3});
-  auto results =
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
+  auto input   = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  auto view    = cudf::strings_column_view(input->view());
+  auto params  = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2, 3});
+  auto results = nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
   EXPECT_EQ(results->size(), 0);
   auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>({1, 2, 3});
-  results       = nvtext::minhash64_permuted(
-    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
+  results = nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
   EXPECT_EQ(results->size(), 0);
 }
 
@@ -199,18 +194,16 @@ TEST_F(MinHashTest, ErrorsTest)
   auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"});
   auto view  = cudf::strings_column_view(input);
   auto empty = cudf::test::fixed_width_column_wrapper<uint32_t>();
-  EXPECT_THROW(
-    nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0),
-    std::invalid_argument);
+  EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0),
+               std::invalid_argument);
   auto empty64 = cudf::test::fixed_width_column_wrapper<uint64_t>();
   EXPECT_THROW(
-    nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0),
+    nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0),
     std::invalid_argument);
+  EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4),
+               std::invalid_argument);
   EXPECT_THROW(
-    nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4),
-    std::invalid_argument);
-  EXPECT_THROW(
-    nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4),
+    nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4),
     std::invalid_argument);
 
   std::vector<std::string> h_input(50000, "");
@@ -219,18 +212,16 @@ TEST_F(MinHashTest, ErrorsTest)
 
   auto const zeroes = thrust::constant_iterator<uint32_t>(0);
   auto params       = cudf::test::fixed_width_column_wrapper<uint32_t>(zeroes, zeroes + 50000);
+  EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4),
+               std::overflow_error);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>(zeroes, zeroes + 50000);
   EXPECT_THROW(
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4),
+    nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4),
     std::overflow_error);
-  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>(zeroes, zeroes + 50000);
-  EXPECT_THROW(nvtext::minhash64_permuted(
-                 view, 0, cudf::column_view(params64), cudf::column_view(params64), 4),
-               std::overflow_error);
 
+  EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(empty), 4),
+               std::invalid_argument);
   EXPECT_THROW(
-    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4),
-    std::invalid_argument);
-  EXPECT_THROW(
-    nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4),
+    nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4),
     std::invalid_argument);
 }
diff --git a/cpp/tests/transform/row_bit_count_test.cu b/cpp/tests/transform/row_bit_count_test.cu
index 01a042130d6..7e203086fca 100644
--- a/cpp/tests/transform/row_bit_count_test.cu
+++ b/cpp/tests/transform/row_bit_count_test.cu
@@ -590,6 +590,7 @@ TEST_F(RowBitCount, EmptyChildColumnInListOfLists)
     cudf::test::fixed_width_column_wrapper<cudf::size_type>{32, 32, 32, 32});
 }
 
+namespace {
 struct sum_functor {
   cudf::size_type const* s0;
   cudf::size_type const* s1;
@@ -597,6 +598,7 @@ struct sum_functor {
 
   cudf::size_type operator() __device__(int i) { return s0[i] + s1[i] + s2[i]; }
 };
+}  // namespace
 
 TEST_F(RowBitCount, Table)
 {
diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp
index cfab570833b..b5d20325b75 100644
--- a/cpp/tests/utilities_tests/logger_tests.cpp
+++ b/cpp/tests/utilities_tests/logger_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,29 +16,25 @@
 
 #include <cudf_test/base_fixture.hpp>
 
-#include <cudf/detail/utilities/logger.hpp>
-
-#include <spdlog/sinks/ostream_sink.h>
+#include <cudf/logger.hpp>
 
 #include <string>
 
 class LoggerTest : public cudf::test::BaseFixture {
   std::ostringstream oss;
-  spdlog::level::level_enum prev_level;
-  std::vector<spdlog::sink_ptr> prev_sinks;
+  cudf::level_enum prev_level;
 
  public:
-  LoggerTest()
-    : prev_level{cudf::detail::logger().level()}, prev_sinks{cudf::detail::logger().sinks()}
+  LoggerTest() : prev_level{cudf::default_logger().level()}
   {
-    cudf::detail::logger().sinks() = {std::make_shared<spdlog::sinks::ostream_sink_mt>(oss)};
-    cudf::detail::logger().set_formatter(
-      std::unique_ptr<spdlog::formatter>(new spdlog::pattern_formatter("%v")));
+    cudf::default_logger().sinks().push_back(std::make_shared<cudf::ostream_sink_mt>(oss));
+    cudf::default_logger().set_pattern("%v");
   }
   ~LoggerTest() override
   {
-    cudf::detail::logger().set_level(prev_level);
-    cudf::detail::logger().sinks() = prev_sinks;
+    cudf::default_logger().set_pattern("[%6t][%H:%M:%S:%f][%-6l] %v");
+    cudf::default_logger().set_level(prev_level);
+    cudf::default_logger().sinks().pop_back();
   }
 
   void clear_sink() { oss.str(""); }
@@ -47,32 +43,32 @@ class LoggerTest : public cudf::test::BaseFixture {
 
 TEST_F(LoggerTest, Basic)
 {
-  cudf::detail::logger().critical("crit msg");
+  cudf::default_logger().critical("crit msg");
   ASSERT_EQ(this->sink_content(), "crit msg\n");
 }
 
 TEST_F(LoggerTest, DefaultLevel)
 {
-  cudf::detail::logger().trace("trace");
-  cudf::detail::logger().debug("debug");
-  cudf::detail::logger().info("info");
-  cudf::detail::logger().warn("warn");
-  cudf::detail::logger().error("error");
-  cudf::detail::logger().critical("critical");
+  cudf::default_logger().trace("trace");
+  cudf::default_logger().debug("debug");
+  cudf::default_logger().info("info");
+  cudf::default_logger().warn("warn");
+  cudf::default_logger().error("error");
+  cudf::default_logger().critical("critical");
   ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n");
 }
 
 TEST_F(LoggerTest, CustomLevel)
 {
-  cudf::detail::logger().set_level(spdlog::level::warn);
-  cudf::detail::logger().info("info");
-  cudf::detail::logger().warn("warn");
+  cudf::default_logger().set_level(cudf::level_enum::warn);
+  cudf::default_logger().info("info");
+  cudf::default_logger().warn("warn");
   ASSERT_EQ(this->sink_content(), "warn\n");
 
   this->clear_sink();
 
-  cudf::detail::logger().set_level(spdlog::level::debug);
-  cudf::detail::logger().trace("trace");
-  cudf::detail::logger().debug("debug");
+  cudf::default_logger().set_level(cudf::level_enum::debug);
+  cudf::default_logger().trace("trace");
+  cudf::default_logger().debug("debug");
   ASSERT_EQ(this->sink_content(), "debug\n");
 }
diff --git a/cpp/tests/wrappers/timestamps_test.cu b/cpp/tests/wrappers/timestamps_test.cu
index 4086c5a91bb..8e5129dfbd2 100644
--- a/cpp/tests/wrappers/timestamps_test.cu
+++ b/cpp/tests/wrappers/timestamps_test.cu
@@ -37,6 +37,7 @@
 #include <thrust/logical.h>
 #include <thrust/sequence.h>
 
+namespace {
 template <typename T>
 struct ChronoColumnTest : public cudf::test::BaseFixture {
   cudf::size_type size() { return cudf::size_type(100); }
@@ -72,6 +73,7 @@ struct compare_chrono_elements_to_primitive_representation {
     return primitive == dur.count();
   }
 };
+}  // namespace
 
 TYPED_TEST_SUITE(ChronoColumnTest, cudf::test::ChronoTypes);
 
@@ -103,6 +105,7 @@ TYPED_TEST(ChronoColumnTest, ChronoDurationsMatchPrimitiveRepresentation)
                                *cudf::column_device_view::create(chrono_col)}));
 }
 
+namespace {
 template <typename ChronoT>
 struct compare_chrono_elements {
   cudf::binary_operator comp;
@@ -129,6 +132,7 @@ struct compare_chrono_elements {
     }
   }
 };
+}  // namespace
 
 TYPED_TEST(ChronoColumnTest, ChronosCanBeComparedInDeviceCode)
 {
diff --git a/dependencies.yaml b/dependencies.yaml
index 044c7d187b3..4672a355c72 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -413,7 +413,6 @@ dependencies:
           - fmt>=11.0.2,<12
           - flatbuffers==24.3.25
           - librdkafka>=2.5.0,<2.6.0a0
-          - spdlog>=1.14.1,<1.15
   depends_on_nvcomp:
     common:
       - output_types: conda
@@ -679,17 +678,17 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - cuda-python>=12.0,<13.0a0
+              - cuda-python>=12.6.2,<13.0a0
           - matrix: {cuda: "11.*"}
             packages: &run_pylibcudf_packages_all_cu11
-              - cuda-python>=11.7.1,<12.0a0
+              - cuda-python>=11.8.5,<12.0a0
           - {matrix: null, packages: *run_pylibcudf_packages_all_cu11}
   run_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
           - cachetools
-          - &numba-cuda-dep numba-cuda>=0.0.13,<0.0.18
+          - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0
           - nvtx>=0.2.1
           - packaging
           - rich
@@ -705,10 +704,10 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - cuda-python>=12.0,<13.0a0
+              - cuda-python>=12.6.2,<13.0a0
           - matrix: {cuda: "11.*"}
             packages: &run_cudf_packages_all_cu11
-              - cuda-python>=11.7.1,<12.0a0
+              - cuda-python>=11.8.5,<12.0a0
           - {matrix: null, packages: *run_cudf_packages_all_cu11}
       - output_types: conda
         matrices:
@@ -748,7 +747,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.11,<1.15
+          - polars>=1.11,<1.18
   run_cudf_polars_experimental:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -758,7 +757,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - pynvml>=11.4.1,<12.0.0a0
+          - pynvml>=12.0.0,<13.0.0a0
           - rapids-dask-dependency==25.2.*,>=0.0.0a0
   run_custreamz:
     common:
@@ -811,11 +810,11 @@ dependencies:
         matrices:
           - matrix: {dependencies: "oldest"}
             packages:
-              - *numba-cuda-dep
+              - numba-cuda==0.2.0
               - pandas==2.0.*
           - matrix: {dependencies: "latest"}
             packages:
-              - numba-cuda==0.0.15
+              - *numba-cuda-dep
               - pandas==2.2.3
           - matrix:
             packages:
@@ -879,7 +878,7 @@ dependencies:
           - matrix: {dependencies: "oldest"}
             packages:
               - numpy==1.23.*
-              - pyarrow==14.0.0
+              - pyarrow==14.*
           - matrix:
             packages:
       - output_types: conda
@@ -904,7 +903,7 @@ dependencies:
           - matrix: {dependencies: "oldest"}
             packages:
               - numpy==1.24.*
-              - pyarrow==14.0.1
+              - pyarrow==14.*
           - matrix:
             packages:
   test_python_cudf_polars:
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index fbb9ca4b128..09214803c0c 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -439,6 +439,8 @@ def _generate_namespaces(namespaces):
     # Sphinx doesn't know how to distinguish between the ORC and Parquet
     # definitions because Breathe doesn't to preserve namespaces for enums.
     "TypeKind",
+    # Span subclasses access base class members
+    "base::",
 }
 
 _domain_objects = None
@@ -594,6 +596,8 @@ def on_missing_reference(app, env, node, contnode):
     # TODO: Remove this when we figure out why typing_extensions doesn't seem
     # to map types correctly for intersphinx
     ("py:class", "typing_extensions.Self"),
+    ("py:class", "np.uint32"),
+    ("py:class", "np.uint64"),
 ]
 
 
diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md
index 5024747227e..222b698a78d 100644
--- a/docs/cudf/source/cudf_pandas/faq.md
+++ b/docs/cudf/source/cudf_pandas/faq.md
@@ -63,11 +63,11 @@ keyword arguments, cuDF is not able to provide GPU acceleration and
 `cudf.pandas` will fall back to the CPU.
 
 The most accurate way to assess which functions run on the GPU is to try
-running the code while using the `cudf.pandas` profiling features. The
-profiler will indicate which functions ran on GPU / CPU. To improve
-performance, try to use only functionality that can run entirely on GPU.
-This helps reduce the number of memory transfers needed to fallback to
-CPU.
+running the code while using the `cudf.pandas` [profiling
+features](cudf-pandas-profiling). The profiler will indicate which functions
+ran on GPU / CPU. To improve performance, try to use only functionality that
+can run entirely on GPU. This helps reduce the number of memory transfers
+needed to fall back to CPU.
 
 ## How can I improve performance of my workflow with `cudf.pandas`?
 
diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md
index 089f283e25d..fed63c2dd0f 100644
--- a/docs/cudf/source/cudf_pandas/usage.md
+++ b/docs/cudf/source/cudf_pandas/usage.md
@@ -75,6 +75,7 @@ with Pool(4) as pool:
     ...
 ```
 
+(cudf-pandas-profiling)=
 ## Profiling `cudf.pandas`
 
 `cudf.pandas` will attempt to use the GPU whenever possible and fall
diff --git a/docs/cudf/source/user_guide/api_docs/general_functions.rst b/docs/cudf/source/user_guide/api_docs/general_functions.rst
index 38e070b0d53..5c5b5cb3b04 100644
--- a/docs/cudf/source/user_guide/api_docs/general_functions.rst
+++ b/docs/cudf/source/user_guide/api_docs/general_functions.rst
@@ -9,26 +9,26 @@ Data manipulations
 .. autosummary::
    :toctree: api/
 
-   cudf.concat
-   cudf.crosstab
-   cudf.cut
-   cudf.factorize
-   cudf.get_dummies
-   cudf.melt
-   cudf.merge
-   cudf.pivot
-   cudf.pivot_table
-   cudf.unstack
+   concat
+   crosstab
+   cut
+   factorize
+   get_dummies
+   melt
+   merge
+   pivot
+   pivot_table
+   unstack
 
 Top-level conversions
 ---------------------
 .. autosummary::
    :toctree: api/
 
-    cudf.to_numeric
-    cudf.from_dataframe
-    cudf.from_dlpack
-    cudf.from_pandas
+   to_numeric
+   from_dataframe
+   from_dlpack
+   from_pandas
 
 Top-level dealing with datetimelike data
 ----------------------------------------
@@ -36,8 +36,8 @@ Top-level dealing with datetimelike data
 .. autosummary::
    :toctree: api/
 
-    cudf.to_datetime
-    cudf.date_range
+   to_datetime
+   date_range
 
 Top-level dealing with Interval data
 ------------------------------------
@@ -45,4 +45,4 @@ Top-level dealing with Interval data
 .. autosummary::
    :toctree: api/
 
-    cudf.interval_range
+   interval_range
diff --git a/docs/cudf/source/user_guide/api_docs/io.rst b/docs/cudf/source/user_guide/api_docs/io.rst
index 417970715f8..ad8ba8a9bdf 100644
--- a/docs/cudf/source/user_guide/api_docs/io.rst
+++ b/docs/cudf/source/user_guide/api_docs/io.rst
@@ -35,10 +35,10 @@ Parquet
 
    read_parquet
    DataFrame.to_parquet
-   cudf.io.parquet.read_parquet_metadata
-   cudf.io.parquet.ParquetDatasetWriter
-   cudf.io.parquet.ParquetDatasetWriter.close
-   cudf.io.parquet.ParquetDatasetWriter.write_table
+   io.parquet.read_parquet_metadata
+   io.parquet.ParquetDatasetWriter
+   io.parquet.ParquetDatasetWriter.close
+   io.parquet.ParquetDatasetWriter.write_table
 
 
 ORC
diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java
index 379750bb0b7..2276b223740 100644
--- a/java/src/main/java/ai/rapids/cudf/Aggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java
@@ -62,15 +62,16 @@ enum Kind {
         LAG(23),
         PTX(24),
         CUDA(25),
-        M2(26),
-        MERGE_M2(27),
-        RANK(28),
-        DENSE_RANK(29),
-        PERCENT_RANK(30),
-        TDIGEST(31), // This can take a delta argument for accuracy level
-        MERGE_TDIGEST(32), // This can take a delta argument for accuracy level
-        HISTOGRAM(33),
-        MERGE_HISTOGRAM(34);
+        HOST_UDF(26),
+        M2(27),
+        MERGE_M2(28),
+        RANK(29),
+        DENSE_RANK(30),
+        PERCENT_RANK(31),
+        TDIGEST(32), // This can take a delta argument for accuracy level
+        MERGE_TDIGEST(33), // This can take a delta argument for accuracy level
+        HISTOGRAM(34),
+        MERGE_HISTOGRAM(35);
 
         final int nativeId;
 
@@ -385,6 +386,35 @@ public boolean equals(Object other) {
         }
     }
 
+    /** Aggregation backed by a host-side UDF; hashing and equality defer to the wrapper. */
+    static final class HostUDFAggregation extends Aggregation {
+        private final HostUDFWrapper wrapper;
+
+        private HostUDFAggregation(HostUDFWrapper wrapper) {
+            super(Kind.HOST_UDF);
+            this.wrapper = wrapper;
+        }
+
+        @Override
+        long createNativeInstance() {
+            return Aggregation.createHostUDFAgg(wrapper.udfNativeHandle);
+        }
+
+        @Override
+        public int hashCode() {
+            return 31 * kind.hashCode() + wrapper.hashCode();
+        }
+
+        @Override
+        public boolean equals(Object other) {
+            if (this == other) {
+                return true;
+            }
+            return other instanceof HostUDFAggregation
+                && wrapper.equals(((HostUDFAggregation) other).wrapper);
+        }
+    }
+
     protected final Kind kind;
 
     protected Aggregation(Kind kind) {
@@ -837,6 +867,15 @@ static MergeSetsAggregation mergeSets(NullEquality nullEquality, NaNEquality nan
         return new MergeSetsAggregation(nullEquality, nanEquality);
     }
 
+    /**
+     * Create an aggregation that executes a host-side user-defined function (UDF).
+     * @param wrapper The wrapper holding the native host UDF handle.
+     * @return A new HostUDFAggregation instance
+     */
+    static HostUDFAggregation hostUDF(HostUDFWrapper wrapper) {
+        return new HostUDFAggregation(wrapper);
+    }
+
     static final class LeadAggregation extends LeadLagAggregation {
         private LeadAggregation(int offset, ColumnVector defaultOutput) {
             super(Kind.LEAD, offset, defaultOutput);
@@ -990,4 +1029,9 @@ static MergeHistogramAggregation mergeHistogram() {
      * Create a TDigest aggregation.
      */
     private static native long createTDigestAgg(int kind, int delta);
+
+    /**
+     * Create a native HOST_UDF aggregation from the handle held by a HostUDFWrapper.
+     */
+    private static native long createHostUDFAgg(long udfNativeHandle);
 }
diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
index 0fae33927b6..27966ddfdd4 100644
--- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
+++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java
@@ -277,6 +277,15 @@ public static GroupByAggregation mergeSets() {
     return new GroupByAggregation(Aggregation.mergeSets());
   }
 
+  /**
+   * Create a group-by aggregation that runs a host-side user-defined function (UDF).
+   * @param wrapper The wrapper for the native host UDF instance.
+   * @return A new GroupByAggregation instance
+   */
+  public static GroupByAggregation hostUDF(HostUDFWrapper wrapper) {
+    return new GroupByAggregation(Aggregation.hostUDF(wrapper));
+  }
+
   /**
    * Merge the partial sets produced by multiple CollectSetAggregations.
    *
diff --git a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java
new file mode 100644
index 00000000000..0b6ecf2e140
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf;
+
+/**
+ * A wrapper around native host UDF aggregations.
+ * <p>
+ * This class is used to store the native handle of a host UDF aggregation and is used as
+ * a proxy object to compute hash code and compare two host UDF aggregations for equality.
+ * <p>
+ * A new host UDF aggregation implementation must extend this class and override the
+ * {@code hashCode} and {@code equals} methods for such purposes.
+ */
+public abstract class HostUDFWrapper {
+  public final long udfNativeHandle; // handed to the native HOST_UDF aggregation factory
+
+  public HostUDFWrapper(long udfNativeHandle) { // NOTE(review): handle ownership not shown here
+    this.udfNativeHandle = udfNativeHandle; // stored as given, no validation
+  }
+}
diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java
index 53af52eff07..5e544e92a77 100644
--- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java
+++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -62,12 +62,13 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f
    * @param filePath Full path of the input Parquet file to read.
    */
   public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, File filePath) {
-    handle = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
-        filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId());
-
+    long[] handles = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
+        filePath.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId());
+    handle = handles[0];
     if (handle == 0) {
       throw new IllegalStateException("Cannot create native chunked Parquet reader object.");
     }
+    multiHostBufferSourceHandle = handles[1];
   }
 
   /**
@@ -100,12 +101,41 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe
   public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit,
                               ParquetOptions opts, HostMemoryBuffer buffer,
                               long offset, long len) {
-    handle = create(chunkSizeByteLimit,passReadLimit,  opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null,
-        buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId());
+    long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len };
+    long[] handles = create(chunkSizeByteLimit,passReadLimit,  opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null,
+        addrsSizes, opts.timeUnit().typeId.getNativeId());
+    handle = handles[0];
+    if (handle == 0) {
+      throw new IllegalStateException("Cannot create native chunked Parquet reader object.");
+    }
+    multiHostBufferSourceHandle = handles[1];
+  }
 
+  /**
+   * Construct the reader instance from a read limit and data in host memory buffers.
+   *
+   * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read,
+   *                           or 0 if there is no limit.
+   * @param passReadLimit Limit on the amount of memory used for reading and decompressing data or
+   *                      0 if there is no limit
+   * @param opts The options for Parquet reading.
+   * @param buffers Array of buffers containing the file data. The buffers are logically
+   *                concatenated to construct the file being read.
+   */
+  public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit,
+                              ParquetOptions opts, HostMemoryBuffer... buffers) {
+    long[] addrsSizes = new long[2 * buffers.length]; // flattened (address, size) pairs
+    for (int idx = 0, slot = 0; idx < buffers.length; idx++, slot += 2) {
+      addrsSizes[slot] = buffers[idx].getAddress();
+      addrsSizes[slot + 1] = buffers[idx].getLength();
+    }
+    long[] handles = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(),
+        opts.getReadBinaryAsString(), null, addrsSizes, opts.timeUnit().typeId.getNativeId());
+    handle = handles[0];
     if (handle == 0) {
       throw new IllegalStateException("Cannot create native chunked Parquet reader object.");
     }
+    multiHostBufferSourceHandle = handles[1];
+  }
 
   /**
@@ -181,6 +211,10 @@ public void close() {
       DataSourceHelper.destroyWrapperDataSource(dataSourceHandle);
       dataSourceHandle = 0;
     }
+    if (multiHostBufferSourceHandle != 0) {
+      destroyMultiHostBufferSource(multiHostBufferSourceHandle);
+      multiHostBufferSourceHandle = 0;
+    }
   }
 
 
@@ -196,6 +230,8 @@ public void close() {
 
   private long dataSourceHandle = 0;
 
+  private long multiHostBufferSourceHandle = 0;
+
   /**
    * Create a native chunked Parquet reader object on heap and return its memory address.
    *
@@ -206,13 +242,12 @@ public void close() {
    * @param filterColumnNames Name of the columns to read, or an empty array if we want to read all.
    * @param binaryToString Whether to convert the corresponding column to String if it is binary.
    * @param filePath Full path of the file to read, or given as null if reading from a buffer.
-   * @param bufferAddrs The address of a buffer to read from, or 0 if we are not using that buffer.
-   * @param length The length of the buffer to read from.
+   * @param bufferAddrsSizes The address and size pairs of buffers to read from, or null if we are not using buffers.
    * @param timeUnit Return type of time unit for timestamps.
    */
-  private static native long create(long chunkSizeByteLimit, long passReadLimit,
-                                    String[] filterColumnNames, boolean[] binaryToString,
-                                    String filePath, long bufferAddrs, long length, int timeUnit);
+  private static native long[] create(long chunkSizeByteLimit, long passReadLimit,
+                                      String[] filterColumnNames, boolean[] binaryToString,
+                                      String filePath, long[] bufferAddrsSizes, int timeUnit);
 
   private static native long createWithDataSource(long chunkedSizeByteLimit,
       String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle);
@@ -222,4 +257,6 @@ private static native long createWithDataSource(long chunkedSizeByteLimit,
   private static native long[] readChunk(long handle);
 
   private static native void close(long handle);
+
+  private static native void destroyMultiHostBufferSource(long handle);
 }
diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index ed029c918e4..d1cc0cc96fe 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -206,7 +206,8 @@ private static void setGlobalValsFromResource(RmmDeviceMemoryResource resource)
    *                       {@link RmmAllocationMode#CUDA_DEFAULT},
    *                       {@link RmmAllocationMode#POOL},
    *                       {@link RmmAllocationMode#ARENA},
-   *                       {@link RmmAllocationMode#CUDA_ASYNC} and
+   *                       {@link RmmAllocationMode#CUDA_ASYNC},
+   *                       {@link RmmAllocationMode#CUDA_ASYNC_FABRIC} and
    *                       {@link RmmAllocationMode#CUDA_MANAGED_MEMORY}
    * @param logConf        How to do logging or null if you don't want to
    * @param poolSize       The initial pool size in bytes
@@ -221,6 +222,7 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
     boolean isPool = (allocationMode & RmmAllocationMode.POOL) != 0;
     boolean isArena = (allocationMode & RmmAllocationMode.ARENA) != 0;
     boolean isAsync = (allocationMode & RmmAllocationMode.CUDA_ASYNC) != 0;
+    boolean isAsyncFabric = (allocationMode & RmmAllocationMode.CUDA_ASYNC_FABRIC) != 0;
     boolean isManaged = (allocationMode & RmmAllocationMode.CUDA_MANAGED_MEMORY) != 0;
 
     if (isAsync && isManaged) {
@@ -246,6 +248,9 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
       } else if (isAsync) {
         resource = new RmmLimitingResourceAdaptor<>(
             new RmmCudaAsyncMemoryResource(poolSize, poolSize), poolSize, 512);
+      } else if (isAsyncFabric) {
+        resource = new RmmLimitingResourceAdaptor<>(
+            new RmmCudaAsyncMemoryResource(poolSize, poolSize, true), poolSize, 512);
       } else if (isManaged) {
         resource = new RmmManagedMemoryResource();
       } else {
@@ -521,7 +526,6 @@ public static DeviceMemoryBuffer alloc(long size, Cuda.Stream stream) {
 
   private static native long allocInternal(long size, long stream) throws RmmException;
 
-
   static native void free(long ptr, long length, long stream) throws RmmException;
 
   /**
@@ -562,7 +566,7 @@ static native long newArenaMemoryResource(long childHandle,
 
   static native void releaseArenaMemoryResource(long handle);
 
-  static native long newCudaAsyncMemoryResource(long size, long release) throws RmmException;
+  static native long newCudaAsyncMemoryResource(long size, long release, boolean fabric) throws RmmException;
 
   static native void releaseCudaAsyncMemoryResource(long handle);
 
@@ -575,7 +579,6 @@ static native long newLoggingResourceAdaptor(long handle, int type, String path,
 
   static native void releaseLoggingResourceAdaptor(long handle);
 
-
   static native long newTrackingResourceAdaptor(long handle, long alignment) throws RmmException;
 
   static native void releaseTrackingResourceAdaptor(long handle);
diff --git a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
index 966c21bee22..3f7bc1fae76 100644
--- a/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
+++ b/java/src/main/java/ai/rapids/cudf/RmmAllocationMode.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,4 +36,9 @@ public class RmmAllocationMode {
    * Use CUDA async suballocation strategy
    */
   public static final int CUDA_ASYNC = 0x00000008;
+  /**
+   * Use CUDA async suballocation strategy with fabric handles that are
+   * peer accessible with read-write access
+   */
+  public static final int CUDA_ASYNC_FABRIC = 0x00000010;
 }
diff --git a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java
index fa1f13cb7ed..cf4936e2e24 100644
--- a/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java
+++ b/java/src/main/java/ai/rapids/cudf/RmmCudaAsyncMemoryResource.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,9 +30,20 @@ public class RmmCudaAsyncMemoryResource implements RmmDeviceMemoryResource {
    * @param releaseThreshold size in bytes for when memory is released back to cuda
    */
   public RmmCudaAsyncMemoryResource(long size, long releaseThreshold) {
+    this(size, releaseThreshold, false);
+  }
+
+  /**
+   * Create a new async memory resource
+   * @param size the initial size of the pool
+   * @param releaseThreshold size in bytes for when memory is released back to cuda
+   * @param fabric if true request peer read+write accessible fabric handles when
+   *        creating the pool
+   */
+  public RmmCudaAsyncMemoryResource(long size, long releaseThreshold, boolean fabric) {
     this.size = size;
     this.releaseThreshold = releaseThreshold;
-    handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold);
+    handle = Rmm.newCudaAsyncMemoryResource(size, releaseThreshold, fabric);
   }
 
   @Override
diff --git a/java/src/main/java/ai/rapids/cudf/Scalar.java b/java/src/main/java/ai/rapids/cudf/Scalar.java
index 286b5c208c9..f3155bc5860 100644
--- a/java/src/main/java/ai/rapids/cudf/Scalar.java
+++ b/java/src/main/java/ai/rapids/cudf/Scalar.java
@@ -521,13 +521,28 @@ private static ColumnVector buildNullColumnVector(HostColumnVector.DataType host
   private static native long makeStructScalar(long[] viewHandles, boolean isValid);
   private static native long repeatString(long scalarHandle, int repeatTimes);
 
-  Scalar(DType type, long scalarHandle) {
+  /**
+   * Constructor to create a scalar from a native handle and a type.
+   *
+   * @param type The type of the scalar
+   * @param scalarHandle The native handle (pointer address) to the scalar data
+   */
+  public Scalar(DType type, long scalarHandle) {
     this.type = type;
     this.offHeap = new OffHeapState(scalarHandle);
     MemoryCleaner.register(this, offHeap);
     incRefCount();
   }
 
+  /**
+   * Get the native handle (native pointer address) for the scalar.
+   *
+   * @return The native handle
+   */
+  public long getScalarHandle() {
+    return offHeap.scalarHandle;
+  }
+
   /**
    * Increment the reference count for this scalar.  You need to call close on this
    * to decrement the reference count again.
@@ -542,10 +557,6 @@ public synchronized Scalar incRefCount() {
     return this;
   }
 
-  long getScalarHandle() {
-    return offHeap.scalarHandle;
-  }
-
   /**
    * Free the memory associated with a scalar.
    */
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index b01ce31b1f3..298f2cff6f3 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -313,12 +313,11 @@ private static native long readAndInferJSON(long address, long length,
    *                           all of them
    * @param binaryToString     whether to convert this column to String if binary
    * @param filePath           the path of the file to read, or null if no path should be read.
-   * @param address            the address of the buffer to read from or 0 if we should not.
-   * @param length             the length of the buffer to read from.
+   * @param addrsAndSizes      the address and size pairs for every buffer or null for no buffers.
    * @param timeUnit           return type of TimeStamp in units
    */
   private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath,
-                                           long address, long length, int timeUnit) throws CudfException;
+                                           long[] addrsAndSizes, int timeUnit) throws CudfException;
 
   private static native long[] readParquetFromDataSource(String[] filterColumnNames,
                                                          boolean[] binaryToString, int timeUnit,
@@ -1357,7 +1356,7 @@ public static Table readParquet(File path) {
    */
   public static Table readParquet(ParquetOptions opts, File path) {
     return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
-        path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()));
+        path.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId()));
   }
 
   /**
@@ -1402,6 +1401,14 @@ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset,
     }
   }
 
+  /**
+   * Read parquet formatted data.
+   * @param opts various parquet parsing options.
+   * @param buffer raw parquet formatted bytes.
+   * @param offset the starting offset into buffer.
+   * @param len the number of bytes to parse.
+   * @return the data parsed as a table on the GPU.
+   */
   public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, long len) {
     return readParquet(opts, buffer, offset, len, DefaultHostMemoryAllocator.get());
   }
@@ -1422,10 +1429,35 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer,
     assert len > 0;
     assert len <= buffer.getLength() - offset;
     assert offset >= 0 && offset < buffer.length;
+    long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len };
+    return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
+        null, addrsSizes, opts.timeUnit().typeId.getNativeId()));
+  }
+
+  /**
+   * Read parquet formatted data.
+   * @param opts various parquet parsing options.
+   * @param buffers Buffers containing the Parquet data. The buffers are logically concatenated
+   *                in order to construct the file being read.
+   * @return the data parsed as a table on the GPU.
+   */
+  public static Table readParquet(ParquetOptions opts, HostMemoryBuffer... buffers) {
+    assert buffers.length > 0;
+    long[] addrsSizes = new long[buffers.length * 2];
+    for (int i = 0; i < buffers.length; i++) {
+      addrsSizes[i * 2] = buffers[i].getAddress();
+      addrsSizes[(i * 2) + 1] = buffers[i].getLength();
+    }
     return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(),
-        null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()));
+        null, addrsSizes, opts.timeUnit().typeId.getNativeId()));
   }
 
+  /**
+   * Read parquet formatted data.
+   * @param opts various parquet parsing options.
+   * @param ds custom datasource to provide the Parquet file data
+   * @return the data parsed as a table on the GPU.
+   */
   public static Table readParquet(ParquetOptions opts, DataSource ds) {
     long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds);
     try {
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index 9ff43feeac6..bd1714aa476 100644
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -156,8 +156,9 @@ add_library(
   src/ScalarJni.cpp
   src/TableJni.cpp
   src/aggregation128_utils.cu
-  src/maps_column_view.cu
   src/check_nvcomp_output_sizes.cu
+  src/maps_column_view.cu
+  src/multi_host_buffer_source.cpp
 )
 
 # Disable NVTX if necessary
diff --git a/java/src/main/native/include/multi_host_buffer_source.hpp b/java/src/main/native/include/multi_host_buffer_source.hpp
new file mode 100644
index 00000000000..2aedb2321e4
--- /dev/null
+++ b/java/src/main/native/include/multi_host_buffer_source.hpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "jni_utils.hpp"
+
+#include <cudf/io/datasource.hpp>
+
+#include <cstddef>
+#include <cstdint>
+#include <future>
+#include <memory>
+#include <vector>
+
+namespace cudf {
+namespace jni {
+
+/**
+ * @brief A custom datasource providing data from an array of host memory buffers.
+ *
+ * The buffers are treated as one logical, contiguous file: buffer 0 holds the first bytes,
+ * buffer 1 the bytes that follow, and so on. Only the buffer addresses are stored; the
+ * memory itself is not copied or freed by this class, so the buffers must stay valid for
+ * as long as the source is being read.
+ */
+class multi_host_buffer_source : public cudf::io::datasource {
+  // Start address of each host buffer, in logical file order.
+  std::vector<uint8_t const*> addrs_;
+  // offsets_[i] is the logical file offset at which buffer i starts; one extra trailing
+  // entry holds the total size, so offsets_ has one more element than addrs_.
+  std::vector<size_t> offsets_;
+
+  // Returns the index of the buffer containing the given logical file offset.
+  size_t locate_offset_index(size_t offset);
+
+ public:
+  /**
+   * @brief Builds a source from a flat array of (address, size) pairs.
+   *
+   * @param addrs_sizes Array laid out as [addr0, size0, addr1, size1, ...]
+   */
+  explicit multi_host_buffer_source(native_jlongArray const& addrs_sizes);
+
+  // Host reads: the first form returns a buffer object, the second copies into `dst`.
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override;
+  size_t host_read(size_t offset, size_t size, uint8_t* dst) override;
+
+  // Device reads are always supported and always reported as preferred.
+  bool supports_device_read() const override { return true; }
+  bool is_device_read_preferred(size_t size) const override { return true; }
+  std::unique_ptr<buffer> device_read(size_t offset,
+                                      size_t size,
+                                      rmm::cuda_stream_view stream) override;
+  size_t device_read(size_t offset,
+                     size_t size,
+                     uint8_t* dst,
+                     rmm::cuda_stream_view stream) override;
+  std::future<size_t> device_read_async(size_t offset,
+                                        size_t size,
+                                        uint8_t* dst,
+                                        rmm::cuda_stream_view stream) override;
+
+  // Total number of bytes across all buffers (the last entry of offsets_).
+  size_t size() const override { return offsets_.back(); }
+};
+
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp
index c40f1c55500..dd41c677761 100644
--- a/java/src/main/native/src/AggregationJni.cpp
+++ b/java/src/main/native/src/AggregationJni.cpp
@@ -17,6 +17,7 @@
 #include "cudf_jni_apis.hpp"
 
 #include <cudf/aggregation.hpp>
+#include <cudf/aggregation/host_udf.hpp>
 
 extern "C" {
 
@@ -80,25 +81,28 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv*
         // case 23: LAG
         // case 24: PTX
         // case 25: CUDA
-        case 26:  // M2
+        // case 26: HOST_UDF
+        case 27:  // M2
           return cudf::make_m2_aggregation();
-        case 27:  // MERGE_M2
+        case 28:  // MERGE_M2
           return cudf::make_merge_m2_aggregation();
-        case 28:  // RANK
+        case 29:  // RANK
           return cudf::make_rank_aggregation(
             cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE);
-        case 29:  // DENSE_RANK
+        case 30:  // DENSE_RANK
           return cudf::make_rank_aggregation(
             cudf::rank_method::DENSE, {}, cudf::null_policy::INCLUDE);
-        case 30:  // ANSI SQL PERCENT_RANK
+        case 31:  // ANSI SQL PERCENT_RANK
           return cudf::make_rank_aggregation(cudf::rank_method::MIN,
                                              {},
                                              cudf::null_policy::INCLUDE,
                                              {},
                                              cudf::rank_percentage::ONE_NORMALIZED);
-        case 33:  // HISTOGRAM
+        // case 32: TDIGEST
+        // case 33: MERGE_TDIGEST
+        case 34:  // HISTOGRAM
           return cudf::make_histogram_aggregation();
-        case 34:  // MERGE_HISTOGRAM
+        case 35:  // MERGE_HISTOGRAM
           return cudf::make_merge_histogram_aggregation();
 
         default: throw std::logic_error("Unsupported No Parameter Aggregation Operation");
@@ -160,10 +164,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createTDigestAgg(JNIEnv*
     std::unique_ptr<cudf::aggregation> ret;
     // These numbers come from Aggregation.java and must stay in sync
     switch (kind) {
-      case 31:  // TDIGEST
+      case 32:  // TDIGEST
         ret = cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
         break;
-      case 32:  // MERGE_TDIGEST
+      case 33:  // MERGE_TDIGEST
         ret = cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
         break;
       default: throw std::logic_error("Unsupported TDigest Aggregation Operation");
@@ -296,4 +300,18 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createMergeSetsAgg(JNIEn
   CATCH_STD(env, 0);
 }
 
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createHostUDFAgg(JNIEnv* env,
+                                                                         jclass class_object,
+                                                                         jlong udf_native_handle)
+{
+  JNI_NULL_CHECK(env, udf_native_handle, "udf_native_handle is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const udf_ptr = reinterpret_cast<cudf::host_udf_base const*>(udf_native_handle);
+    auto output        = cudf::make_host_udf_aggregation(udf_ptr->clone());
+    return reinterpret_cast<jlong>(output.release());
+  }
+  CATCH_STD(env, 0);
+}
+
 }  // extern "C"
diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp
index cf04a87262f..4967e0b2b04 100644
--- a/java/src/main/native/src/ChunkedReaderJni.cpp
+++ b/java/src/main/native/src/ChunkedReaderJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 
 #include "cudf_jni_apis.hpp"
 #include "jni_utils.hpp"
+#include "multi_host_buffer_source.hpp"
 
 #include <cudf/column/column.hpp>
 #include <cudf/io/orc.hpp>
@@ -36,7 +37,7 @@ extern "C" {
 
 // This function should take all the parameters that `Table.readParquet` takes,
 // plus one more parameter `long chunkSizeByteLimit`.
-JNIEXPORT jlong JNICALL
+JNIEXPORT jlongArray JNICALL
 Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env,
                                                 jclass,
                                                 jlong chunk_read_limit,
@@ -44,27 +45,26 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env,
                                                 jobjectArray filter_col_names,
                                                 jbooleanArray j_col_binary_read,
                                                 jstring inp_file_path,
-                                                jlong buffer,
-                                                jlong buffer_length,
+                                                jlongArray addrs_sizes,
                                                 jint unit)
 {
-  JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0);
+  JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", nullptr);
   bool read_buffer = true;
-  if (buffer == 0) {
-    JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", 0);
+  if (addrs_sizes == nullptr) {
+    JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", nullptr);
     read_buffer = false;
   } else if (inp_file_path != nullptr) {
-    JNI_THROW_NEW(
-      env, cudf::jni::ILLEGAL_ARG_CLASS, "Cannot pass in both a buffer and an inp_file_path", 0);
-  } else if (buffer_length <= 0) {
-    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0);
+    JNI_THROW_NEW(env,
+                  cudf::jni::ILLEGAL_ARG_CLASS,
+                  "Cannot pass in both buffers and an inp_file_path",
+                  nullptr);
   }
 
   try {
     cudf::jni::auto_set_device(env);
     cudf::jni::native_jstring filename(env, inp_file_path);
     if (!read_buffer && filename.is_empty()) {
-      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", 0);
+      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", nullptr);
     }
 
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
@@ -75,9 +75,15 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env,
     cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
     (void)n_col_binary_read;
 
-    auto const source = read_buffer ? cudf::io::source_info(reinterpret_cast<char*>(buffer),
-                                                            static_cast<std::size_t>(buffer_length))
-                                    : cudf::io::source_info(filename.get());
+    cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_sizes);
+    std::unique_ptr<cudf::io::datasource> multi_buffer_source;
+    cudf::io::source_info source;
+    if (read_buffer) {
+      multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes));
+      source = cudf::io::source_info(multi_buffer_source.get());
+    } else {
+      source = cudf::io::source_info(filename.get());
+    }
 
     auto opts_builder = cudf::io::parquet_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
@@ -86,13 +92,18 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env,
     auto const read_opts = opts_builder.convert_strings_to_categories(false)
                              .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
                              .build();
-
-    return reinterpret_cast<jlong>(
+    n_addrs_sizes.cancel();
+    n_col_binary_read.cancel();
+    auto reader_handle = reinterpret_cast<jlong>(
       new cudf::io::chunked_parquet_reader(static_cast<std::size_t>(chunk_read_limit),
                                            static_cast<std::size_t>(pass_read_limit),
                                            read_opts));
+    cudf::jni::native_jlongArray result(env, 2);
+    result[0] = reader_handle;
+    result[1] = cudf::jni::release_as_jlong(multi_buffer_source);
+    return result.get_jArray();
   }
-  CATCH_STD(env, 0);
+  CATCH_STD(env, nullptr);
 }
 
 JNIEXPORT jlong JNICALL
@@ -177,6 +188,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* en
   CATCH_STD(env, );
 }
 
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_destroyMultiHostBufferSource(
+  JNIEnv* env, jclass, jlong handle)
+{
+  JNI_NULL_CHECK(env, handle, "handle is null", );
+
+  try {
+    delete reinterpret_cast<cudf::jni::multi_host_buffer_source*>(handle);
+  }
+  CATCH_STD(env, );
+}
+
 //
 // Chunked ORC reader JNI
 //
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 23c7b7fb243..8c733018fa7 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -772,14 +772,18 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv* env,
-                                                                           jclass clazz,
-                                                                           jlong init,
-                                                                           jlong release)
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(
+  JNIEnv* env, jclass clazz, jlong init, jlong release, jboolean fabric)
 {
   try {
     cudf::jni::auto_set_device(env);
-    auto ret = new rmm::mr::cuda_async_memory_resource(init, release);
+
+    auto handle_type =
+      fabric ? std::optional{rmm::mr::cuda_async_memory_resource::allocation_handle_type::fabric}
+             : std::nullopt;
+
+    auto ret = new rmm::mr::cuda_async_memory_resource(init, release, handle_type);
+
     return reinterpret_cast<jlong>(ret);
   }
   CATCH_STD(env, 0)
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 1f8b1ea207d..a6c7ae9ba18 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include "jni_compiled_expr.hpp"
 #include "jni_utils.hpp"
 #include "jni_writer_data_sink.hpp"
+#include "multi_host_buffer_source.hpp"
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column.hpp>
@@ -2071,20 +2072,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
                                                                    jobjectArray filter_col_names,
                                                                    jbooleanArray j_col_binary_read,
                                                                    jstring inputfilepath,
-                                                                   jlong buffer,
-                                                                   jlong buffer_length,
+                                                                   jlongArray addrs_and_sizes,
                                                                    jint unit)
 {
   JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0);
   bool read_buffer = true;
-  if (buffer == 0) {
+  if (addrs_and_sizes == nullptr) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
     read_buffer = false;
   } else if (inputfilepath != NULL) {
     JNI_THROW_NEW(
       env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL);
-  } else if (buffer_length <= 0) {
-    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL);
   }
 
   try {
@@ -2096,10 +2094,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
 
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
     cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
-
-    auto source = read_buffer ? cudf::io::source_info(reinterpret_cast<char*>(buffer),
-                                                      static_cast<std::size_t>(buffer_length))
-                              : cudf::io::source_info(filename.get());
+    cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_and_sizes);
+    std::unique_ptr<cudf::io::datasource> multi_buffer_source;
+    cudf::io::source_info source;
+    if (read_buffer) {
+      multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes));
+      source = cudf::io::source_info(multi_buffer_source.get());
+    } else {
+      source = cudf::io::source_info(filename.get());
+    }
 
     auto builder = cudf::io::parquet_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
@@ -2110,7 +2113,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
       builder.convert_strings_to_categories(false)
         .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
         .build();
-    return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl);
+    auto tbl = cudf::io::read_parquet(opts).tbl;
+    n_col_binary_read.cancel();
+    n_addrs_sizes.cancel();
+    return convert_table_for_return(env, tbl);
   }
   CATCH_STD(env, NULL);
 }
@@ -2901,16 +2907,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftDistinctJoinGatherMap
     j_right_keys,
     compare_nulls_equal,
     [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) {
-      auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right)
-                         ? cudf::nullable_join::YES
-                         : cudf::nullable_join::NO;
-      if (cudf::has_nested_columns(right)) {
-        cudf::distinct_hash_join<cudf::has_nested::YES> hash(right, left, has_nulls, nulleq);
-        return hash.left_join();
-      } else {
-        cudf::distinct_hash_join<cudf::has_nested::NO> hash(right, left, has_nulls, nulleq);
-        return hash.left_join();
-      }
+      cudf::distinct_hash_join hash(right, nulleq);
+      return hash.left_join(left);
     });
 }
 
@@ -3119,22 +3117,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerDistinctJoinGatherMa
     j_right_keys,
     compare_nulls_equal,
     [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) {
-      auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right)
-                         ? cudf::nullable_join::YES
-                         : cudf::nullable_join::NO;
-      std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
-                std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-        maps;
-      if (cudf::has_nested_columns(right)) {
-        cudf::distinct_hash_join<cudf::has_nested::YES> hash(right, left, has_nulls, nulleq);
-        maps = hash.inner_join();
-      } else {
-        cudf::distinct_hash_join<cudf::has_nested::NO> hash(right, left, has_nulls, nulleq);
-        maps = hash.inner_join();
-      }
-      // Unique join returns {right map, left map} but all the other joins
-      // return {left map, right map}. Swap here to make it consistent.
-      return std::make_pair(std::move(maps.second), std::move(maps.first));
+      cudf::distinct_hash_join hash(right, nulleq);
+      return hash.inner_join(left);
     });
 }
 
diff --git a/java/src/main/native/src/multi_host_buffer_source.cpp b/java/src/main/native/src/multi_host_buffer_source.cpp
new file mode 100644
index 00000000000..c577fc680ba
--- /dev/null
+++ b/java/src/main/native/src/multi_host_buffer_source.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "multi_host_buffer_source.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <future>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+namespace cudf {
+namespace jni {
+
+namespace {
+
+/**
+ * @brief Throws unless the byte range [offset, offset + size) fits inside `total` bytes.
+ *
+ * The end-of-file test is phrased as `size > total - offset` rather than
+ * `offset + size > total` so that a very large `size` cannot wrap around and be accepted.
+ * `offset` and `size` are unsigned, so no "negative" check is needed.
+ */
+void check_read_range(size_t offset, size_t size, size_t total)
+{
+  if (offset >= total) { throw std::runtime_error("bad offset"); }
+  if (size > total - offset) { throw std::runtime_error("read past end of file"); }
+}
+
+}  // namespace
+
+/**
+ * @brief Records the buffer addresses and builds the table of logical start offsets.
+ *
+ * @param addrs_sizes Flat array of (address, size) pairs: [addr0, size0, addr1, size1, ...]
+ */
+multi_host_buffer_source::multi_host_buffer_source(native_jlongArray const& addrs_sizes)
+{
+  if (addrs_sizes.size() % 2 != 0) {
+    throw std::logic_error("addrs_sizes length not a multiple of 2");
+  }
+  auto count = addrs_sizes.size() / 2;
+  addrs_.reserve(count);
+  offsets_.reserve(count + 1);
+  size_t total_size = 0;
+  for (int i = 0; i < addrs_sizes.size(); i += 2) {
+    auto const buffer_size = addrs_sizes[i + 1];
+    // Sizes arrive as signed 64-bit values; a negative one would wrap to a huge unsigned
+    // value when accumulated and corrupt every offset that follows.
+    if (buffer_size < 0) { throw std::logic_error("negative buffer size in addrs_sizes"); }
+    addrs_.push_back(reinterpret_cast<uint8_t const*>(addrs_sizes[i]));
+    offsets_.push_back(total_size);
+    total_size += static_cast<size_t>(buffer_size);
+  }
+  // Trailing sentinel: offsets_.back() is the total size of the logical file.
+  offsets_.push_back(total_size);
+}
+
+/**
+ * @brief Finds which buffer holds the byte at logical file position `offset`.
+ *
+ * @throws std::runtime_error if `offset` is at or beyond the end of the data
+ */
+size_t multi_host_buffer_source::locate_offset_index(size_t offset)
+{
+  if (offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  // upper_bound yields the first start offset strictly greater than `offset`, so the entry
+  // just before it is the buffer containing `offset`. Zero-length buffers share a start
+  // offset with their successor and are stepped over.
+  auto start = offsets_.begin();
+  auto it    = std::upper_bound(start, offsets_.end(), offset);
+  return (it - start) - 1;
+}
+
+/**
+ * @brief Reads `size` bytes starting at `offset` and returns them as a buffer object.
+ *
+ * A range that lies inside one underlying buffer is returned as a zero-copy view; a range
+ * spanning several buffers is copied into a newly allocated vector.
+ *
+ * @return The data, or a null pointer when `size` is 0
+ */
+std::unique_ptr<cudf::io::datasource::buffer> multi_host_buffer_source::host_read(size_t offset,
+                                                                                  size_t size)
+{
+  // NOTE(review): a zero-length read yields a null buffer (the original spelled this `0`);
+  // confirm that callers never dereference the result of an empty read.
+  if (size == 0) { return nullptr; }
+  check_read_range(offset, size, offsets_.back());
+  auto const end_offset = offset + size;  // cannot wrap: range was validated above
+  auto buffer_index     = locate_offset_index(offset);
+  auto next_offset      = offsets_[buffer_index + 1];
+  if (end_offset <= next_offset) {
+    // read range hits only a single buffer, so return a zero-copy view of the data
+    auto src = addrs_[buffer_index] + offset - offsets_[buffer_index];
+    return std::make_unique<non_owning_buffer>(src, size);
+  }
+  auto buf        = std::vector<uint8_t>(size);
+  auto bytes_read = host_read(offset, size, buf.data());
+  if (bytes_read != size) {
+    std::stringstream ss;
+    ss << "Expected host read of " << size << " found " << bytes_read;
+    throw std::logic_error(ss.str());
+  }
+  return std::make_unique<owning_buffer<std::vector<uint8_t>>>(std::move(buf));
+}
+
+/**
+ * @brief Copies `size` bytes starting at `offset` into host memory at `dst`.
+ *
+ * @return The number of bytes copied (`size`)
+ */
+size_t multi_host_buffer_source::host_read(size_t offset, size_t size, uint8_t* dst)
+{
+  if (size == 0) { return 0; }
+  check_read_range(offset, size, offsets_.back());
+  auto buffer_index = locate_offset_index(offset);
+  auto bytes_left   = size;
+  // Walk forward buffer by buffer, copying whatever part of the range each one holds.
+  while (bytes_left > 0) {
+    auto next_offset   = offsets_[buffer_index + 1];
+    auto buffer_left   = next_offset - offset;
+    auto buffer_offset = offset - offsets_[buffer_index];
+    auto src           = addrs_[buffer_index] + buffer_offset;
+    auto copy_size     = std::min(buffer_left, bytes_left);
+    std::memcpy(dst, src, copy_size);
+    offset += copy_size;
+    dst += copy_size;
+    bytes_left -= copy_size;
+    ++buffer_index;
+  }
+  return size;
+}
+
+/**
+ * @brief Reads `size` bytes starting at `offset` into a newly allocated device buffer.
+ */
+std::unique_ptr<cudf::io::datasource::buffer> multi_host_buffer_source::device_read(
+  size_t offset, size_t size, rmm::cuda_stream_view stream)
+{
+  rmm::device_buffer buf(size, stream);
+  auto dst        = static_cast<uint8_t*>(buf.data());
+  auto bytes_read = device_read(offset, size, dst, stream);
+  if (bytes_read != size) {
+    std::stringstream ss;
+    ss << "Expected device read of " << size << " found " << bytes_read;
+    throw std::logic_error(ss.str());
+  }
+  return std::make_unique<owning_buffer<rmm::device_buffer>>(std::move(buf));
+}
+
+/**
+ * @brief Copies `size` bytes starting at `offset` to device memory at `dst`.
+ *
+ * One host-to-device copy is enqueued on `stream` per underlying buffer touched. The
+ * stream is not synchronized here.
+ *
+ * @return The number of bytes requested (`size`)
+ */
+size_t multi_host_buffer_source::device_read(size_t offset,
+                                             size_t size,
+                                             uint8_t* dst,
+                                             rmm::cuda_stream_view stream)
+{
+  if (size == 0) { return 0; }
+  check_read_range(offset, size, offsets_.back());
+  auto buffer_index = locate_offset_index(offset);
+  auto bytes_left   = size;
+  while (bytes_left > 0) {
+    auto next_offset   = offsets_[buffer_index + 1];
+    auto buffer_left   = next_offset - offset;
+    auto buffer_offset = offset - offsets_[buffer_index];
+    auto src           = addrs_[buffer_index] + buffer_offset;
+    auto copy_size     = std::min(buffer_left, bytes_left);
+    CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, copy_size, cudaMemcpyHostToDevice, stream.value()));
+    offset += copy_size;
+    dst += copy_size;
+    bytes_left -= copy_size;
+    ++buffer_index;
+  }
+  return size;
+}
+
+/**
+ * @brief Future-returning form of the device read.
+ *
+ * The copies are enqueued by a direct call to device_read before this function returns, so
+ * the returned future is already fulfilled when the caller receives it.
+ */
+std::future<size_t> multi_host_buffer_source::device_read_async(size_t offset,
+                                                                size_t size,
+                                                                uint8_t* dst,
+                                                                rmm::cuda_stream_view stream)
+{
+  std::promise<size_t> p;
+  p.set_value(device_read(offset, size, dst, stream));
+  return p.get_future();
+}
+
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index c7fcb1756b6..7eb32892bad 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -47,8 +47,11 @@
 import java.math.BigInteger;
 import java.math.RoundingMode;
 import java.nio.ByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.SeekableByteChannel;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
+import java.nio.file.StandardOpenOption;
 import java.util.*;
 import java.util.function.Function;
 import java.util.stream.Collectors;
@@ -1714,6 +1717,42 @@ void testChunkedReadParquet() {
     }
   }
 
+  // Reads a Parquet file split across two host buffers through the chunked reader.
+  @Test
+  void testChunkedReadParquetHostBuffers() throws Exception {
+    java.nio.file.Path parquetPath = TEST_PARQUET_FILE_CHUNKED_READ.toPath();
+    long fileLength = TEST_PARQUET_FILE_CHUNKED_READ.length();
+    try (HostMemoryBuffer buf1 = HostMemoryBuffer.allocate(fileLength / 2);
+         HostMemoryBuffer buf2 = HostMemoryBuffer.allocate(fileLength - buf1.getLength())) {
+      // Fill the first buffer with the leading bytes of the file and the second with the rest.
+      try (SeekableByteChannel in = Files.newByteChannel(parquetPath, StandardOpenOption.READ)) {
+        for (ByteBuffer head = buf1.asByteBuffer(); head.hasRemaining(); ) {
+          if (in.read(head) == -1) {
+            throw new EOFException("error reading first buffer");
+          }
+        }
+        for (ByteBuffer tail = buf2.asByteBuffer(); tail.hasRemaining(); ) {
+          if (in.read(tail) == -1) {
+            throw new EOFException("error reading second buffer");
+          }
+        }
+      }
+      int chunkCount = 0;
+      long rowCount = 0;
+      try (ParquetChunkedReader reader =
+               new ParquetChunkedReader(240000, 0, ParquetOptions.DEFAULT, buf1, buf2)) {
+        while (reader.hasNext()) {
+          try (Table chunk = reader.readChunk()) {
+            rowCount += chunk.getRowCount();
+          }
+          chunkCount++;
+        }
+        assertEquals(2, chunkCount);
+        assertEquals(40000, rowCount);
+      }
+    }
+  }
+
   @Test
   void testChunkedReadParquetFromDataSource() throws IOException {
     try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ);
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 2f05101e8e3..ff6fba1c3e8 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -12,26 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources
-    column.pyx
-    copying.pyx
-    csv.pyx
-    groupby.pyx
-    interop.pyx
-    orc.pyx
-    parquet.pyx
-    reduce.pyx
-    round.pyx
-    scalar.pyx
-    sort.pyx
-    stream_compaction.pyx
-    string_casting.pyx
-    strings_udf.pyx
-    text.pyx
-    transform.pyx
-    types.pyx
-    utils.pyx
-)
+set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx)
 set(linked_libraries cudf::cudf)
 
 rapids_cython_create_modules(
@@ -41,12 +22,3 @@ rapids_cython_create_modules(
 )
 
 target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
-target_include_directories(interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
-
-include(${rapids-cmake-dir}/export/find_package_root.cmake)
-include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
-target_link_libraries(interop PUBLIC nanoarrow)
-
-add_subdirectory(io)
-add_subdirectory(nvtext)
-add_subdirectory(strings)
diff --git a/python/cudf/cudf/_lib/__init__.pxd b/python/cudf/cudf/_lib/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index cb2d0501fea..11473d60698 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -1,25 +1,2 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
-import numpy as np
-
-from . import (
-    copying,
-    csv,
-    groupby,
-    interop,
-    nvtext,
-    orc,
-    parquet,
-    reduce,
-    round,
-    sort,
-    stream_compaction,
-    string_casting,
-    strings,
-    strings_udf,
-    text,
-)
-
-MAX_COLUMN_SIZE = np.iinfo(np.int32).max
-MAX_COLUMN_SIZE_STR = "INT32_MAX"
-MAX_STRING_COLUMN_BYTES = np.iinfo(np.int32).max
-MAX_STRING_COLUMN_BYTES_STR = "INT32_MAX"
+from . import strings_udf
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 245a5d03981..f7dcd89ea48 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -31,12 +31,12 @@ from rmm.pylibrmm.device_buffer cimport DeviceBuffer
 
 from cudf._lib.types cimport (
     dtype_from_column_view,
-    dtype_to_data_type,
     dtype_to_pylibcudf_type,
 )
 
 from cudf._lib.types import dtype_from_pylibcudf_column
 
+from pylibcudf cimport DataType as plc_DataType
 cimport pylibcudf.libcudf.copying as cpp_copying
 cimport pylibcudf.libcudf.types as libcudf_types
 cimport pylibcudf.libcudf.unary as libcudf_unary
@@ -361,7 +361,7 @@ cdef class Column:
             col = self
             data_dtype = col.dtype
 
-        cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
+        cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype)
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[mutable_column_view] children
         cdef void* data
@@ -398,7 +398,7 @@ cdef class Column:
         self._data = None
 
         return mutable_column_view(
-            dtype,
+            dtype.c_obj,
             self.size,
             data,
             mask,
@@ -424,7 +424,7 @@ cdef class Column:
             col = self
             data_dtype = col.dtype
 
-        cdef libcudf_types.data_type dtype = dtype_to_data_type(data_dtype)
+        cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype)
         cdef libcudf_types.size_type offset = self.offset
         cdef vector[column_view] children
         cdef void* data
@@ -450,7 +450,7 @@ cdef class Column:
         cdef libcudf_types.size_type c_null_count = null_count
 
         return column_view(
-            dtype,
+            dtype.c_obj,
             self.size,
             data,
             mask,
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
deleted file mode 100644
index 1f3f03f4be1..00000000000
--- a/python/cudf/cudf/_lib/copying.pyx
+++ /dev/null
@@ -1,456 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import pickle
-
-from libcpp cimport bool
-import pylibcudf
-
-import cudf
-from cudf.core.buffer import acquire_spill_lock, as_buffer
-from cudf.core.abc import Serializable
-from cudf._lib.column cimport Column
-
-from cudf._lib.scalar import as_device_scalar
-
-from cudf._lib.scalar cimport DeviceScalar
-
-from cudf._lib.reduce import minmax
-
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_pylibcudf_table
-import pylibcudf as plc
-from pylibcudf.contiguous_split cimport PackedColumns as PlcPackedColumns
-
-
-def _gather_map_is_valid(
-    gather_map: "cudf.core.column.ColumnBase",
-    nrows: int,
-    check_bounds: bool,
-    nullify: bool,
-) -> bool:
-    """Returns true if gather map is valid.
-
-    A gather map is valid if empty or all indices are within the range
-    ``[-nrows, nrows)``, except when ``nullify`` is specified.
-    """
-    if not check_bounds or nullify or len(gather_map) == 0:
-        return True
-    gm_min, gm_max = minmax(gather_map)
-    return gm_min >= -nrows and gm_max < nrows
-
-
-@acquire_spill_lock()
-def copy_column(Column input_column):
-    """
-    Deep copies a column
-
-    Parameters
-    ----------
-    input_columns : column to be copied
-
-    Returns
-    -------
-    Deep copied column
-    """
-    return Column.from_pylibcudf(
-        input_column.to_pylibcudf(mode="read").copy()
-    )
-
-
-@acquire_spill_lock()
-def _copy_range_in_place(Column input_column,
-                         Column target_column,
-                         size_type input_begin,
-                         size_type input_end,
-                         size_type target_begin):
-    pylibcudf.copying.copy_range(
-        input_column.to_pylibcudf(mode="write"),
-        target_column.to_pylibcudf(mode="write"),
-        input_begin,
-        input_end,
-        target_begin
-    )
-
-
-def _copy_range(Column input_column,
-                Column target_column,
-                size_type input_begin,
-                size_type input_end,
-                size_type target_begin):
-    return Column.from_pylibcudf(
-        pylibcudf.copying.copy_range(
-            input_column.to_pylibcudf(mode="read"),
-            target_column.to_pylibcudf(mode="read"),
-            input_begin,
-            input_end,
-            target_begin
-        )
-    )
-
-
-@acquire_spill_lock()
-def copy_range(Column source_column,
-               Column target_column,
-               size_type source_begin,
-               size_type source_end,
-               size_type target_begin,
-               size_type target_end,
-               bool inplace):
-    """
-    Copy a contiguous range from a source to a target column
-
-    Notes
-    -----
-    Expects the source and target ranges to have been sanitised to be
-    in-range for the source and target column respectively. For
-    example via ``slice.indices``.
-    """
-
-    msg = "Source and target ranges must be same length"
-    assert source_end - source_begin == target_end - target_begin, msg
-    if target_end >= target_begin and inplace:
-        # FIXME: Are we allowed to do this when inplace=False?
-        return target_column
-
-    if inplace:
-        _copy_range_in_place(source_column, target_column,
-                             source_begin, source_end, target_begin)
-    else:
-        return _copy_range(source_column, target_column,
-                           source_begin, source_end, target_begin)
-
-
-@acquire_spill_lock()
-def gather(
-    list columns,
-    Column gather_map,
-    bool nullify=False
-):
-    tbl = pylibcudf.copying.gather(
-        pylibcudf.Table([col.to_pylibcudf(mode="read") for col in columns]),
-        gather_map.to_pylibcudf(mode="read"),
-        pylibcudf.copying.OutOfBoundsPolicy.NULLIFY if nullify
-        else pylibcudf.copying.OutOfBoundsPolicy.DONT_CHECK
-    )
-    return columns_from_pylibcudf_table(tbl)
-
-
-@acquire_spill_lock()
-def scatter(list sources, Column scatter_map, list target_columns,
-            bool bounds_check=True):
-    """
-    Scattering source into target as per the scatter map.
-    `source` can be a list of scalars, or a list of columns. The number of
-    items in `sources` must equal the number of `target_columns` to scatter.
-    """
-    # TODO: Only single column scatter is used, we should explore multi-column
-    # scatter for frames for performance increase.
-
-    if len(sources) != len(target_columns):
-        raise ValueError("Mismatched number of source and target columns.")
-
-    if len(sources) == 0:
-        return []
-
-    if bounds_check:
-        n_rows = len(target_columns[0])
-        if not (
-            (scatter_map >= -n_rows).all()
-            and (scatter_map < n_rows).all()
-        ):
-            raise IndexError(
-                f"index out of bounds for column of size {n_rows}"
-            )
-
-    tbl = pylibcudf.copying.scatter(
-        pylibcudf.Table([col.to_pylibcudf(mode="read") for col in sources])
-        if isinstance(sources[0], Column)
-        else [(<DeviceScalar> as_device_scalar(slr)).c_value for slr in sources],
-        scatter_map.to_pylibcudf(mode="read"),
-        pylibcudf.Table([col.to_pylibcudf(mode="read") for col in target_columns]),
-    )
-
-    return columns_from_pylibcudf_table(tbl)
-
-
-@acquire_spill_lock()
-def column_empty_like(Column input_column):
-    return Column.from_pylibcudf(
-        pylibcudf.copying.empty_like(
-            input_column.to_pylibcudf(mode="read")
-        )
-    )
-
-
-@acquire_spill_lock()
-def column_allocate_like(Column input_column, size=None):
-    return Column.from_pylibcudf(
-        pylibcudf.copying.allocate_like(
-            input_column.to_pylibcudf(mode="read"),
-            size,
-        )
-    )
-
-
-@acquire_spill_lock()
-def columns_empty_like(list input_columns):
-    return columns_from_pylibcudf_table(
-        pylibcudf.copying.empty_like(
-            pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns])
-        )
-    )
-
-
-@acquire_spill_lock()
-def column_slice(Column input_column, object indices):
-    return [
-        Column.from_pylibcudf(c)
-        for c in pylibcudf.copying.slice(
-            input_column.to_pylibcudf(mode="read"),
-            list(indices),
-        )
-    ]
-
-
-@acquire_spill_lock()
-def columns_slice(list input_columns, object indices):
-    return [
-        columns_from_pylibcudf_table(tbl)
-        for tbl in pylibcudf.copying.slice(
-            pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]),
-            list(indices),
-        )
-    ]
-
-
-@acquire_spill_lock()
-def column_split(Column input_column, object splits):
-    return [
-        Column.from_pylibcudf(c)
-        for c in pylibcudf.copying.split(
-            input_column.to_pylibcudf(mode="read"),
-            list(splits),
-        )
-    ]
-
-
-@acquire_spill_lock()
-def columns_split(list input_columns, object splits):
-    return [
-        columns_from_pylibcudf_table(tbl)
-        for tbl in pylibcudf.copying.split(
-            pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_columns]),
-            list(splits),
-        )
-    ]
-
-
-@acquire_spill_lock()
-def copy_if_else(object lhs, object rhs, Column boolean_mask):
-    return Column.from_pylibcudf(
-        pylibcudf.copying.copy_if_else(
-            lhs.to_pylibcudf(mode="read") if isinstance(lhs, Column)
-            else (<DeviceScalar> as_device_scalar(lhs)).c_value,
-            rhs.to_pylibcudf(mode="read") if isinstance(rhs, Column)
-            else (<DeviceScalar> as_device_scalar(rhs)).c_value,
-            boolean_mask.to_pylibcudf(mode="read"),
-        )
-    )
-
-
-@acquire_spill_lock()
-def boolean_mask_scatter(list input_, list target_columns,
-                         Column boolean_mask):
-    """Copy the target columns, replacing masked rows with input data.
-
-    The ``input_`` data can be a list of columns or as a list of scalars.
-    A list of input columns will be used to replace corresponding rows in the
-    target columns for which the boolean mask is ``True``. For the nth ``True``
-    in the boolean mask, the nth row in ``input_`` is used to replace. A list
-    of input scalars will replace all rows in the target columns for which the
-    boolean mask is ``True``.
-    """
-    if len(input_) != len(target_columns):
-        raise ValueError("Mismatched number of input and target columns.")
-
-    if len(input_) == 0:
-        return []
-
-    tbl = pylibcudf.copying.boolean_mask_scatter(
-        pylibcudf.Table([col.to_pylibcudf(mode="read") for col in input_])
-        if isinstance(input_[0], Column)
-        else [(<DeviceScalar> as_device_scalar(i)).c_value for i in input_],
-        pylibcudf.Table([col.to_pylibcudf(mode="read") for col in target_columns]),
-        boolean_mask.to_pylibcudf(mode="read"),
-    )
-
-    return columns_from_pylibcudf_table(tbl)
-
-
-@acquire_spill_lock()
-def shift(Column input, int offset, object fill_value=None):
-    cdef DeviceScalar fill
-
-    if isinstance(fill_value, DeviceScalar):
-        fill = fill_value
-    else:
-        fill = as_device_scalar(fill_value, input.dtype)
-
-    col = pylibcudf.copying.shift(
-        input.to_pylibcudf(mode="read"),
-        offset,
-        fill.c_value,
-    )
-    return Column.from_pylibcudf(col)
-
-
-@acquire_spill_lock()
-def get_element(Column input_column, size_type index):
-    return DeviceScalar.from_pylibcudf(
-        pylibcudf.copying.get_element(
-            input_column.to_pylibcudf(mode="read"),
-            index,
-        ),
-        dtype=input_column.dtype,
-    )
-
-
-class PackedColumns(Serializable):
-    """
-    A packed representation of a Frame, with all columns residing
-    in a single GPU memory buffer.
-    """
-
-    def __init__(
-        self,
-        PlcPackedColumns data,
-        object column_names = None,
-        object index_names = None,
-        object column_dtypes = None
-    ):
-        self._metadata, self._gpu_data = data.release()
-        self.column_names=column_names
-        self.index_names=index_names
-        self.column_dtypes=column_dtypes
-
-    def __reduce__(self):
-        return self.deserialize, self.serialize()
-
-    @property
-    def __cuda_array_interface__(self):
-        return self._gpu_data.__cuda_array_interface__
-
-    def serialize(self):
-        header = {}
-        frames = []
-        gpu_data = as_buffer(
-            data = self._gpu_data.obj.ptr,
-            size = self._gpu_data.obj.size,
-            owner=self,
-            exposed=True
-        )
-        data_header, data_frames = gpu_data.serialize()
-        header["data"] = data_header
-        frames.extend(data_frames)
-
-        header["column-names"] = self.column_names
-        header["index-names"] = self.index_names
-        header["metadata"] = self._metadata.tobytes()
-        for name, dtype in self.column_dtypes.items():
-            dtype_header, dtype_frames = dtype.serialize()
-            self.column_dtypes[name] = (
-                dtype_header,
-                (len(frames), len(frames) + len(dtype_frames)),
-            )
-            frames.extend(dtype_frames)
-        header["column-dtypes"] = self.column_dtypes
-        header["type-serialized"] = pickle.dumps(type(self))
-        return header, frames
-
-    @classmethod
-    def deserialize(cls, header, frames):
-        column_dtypes = {}
-        for name, dtype in header["column-dtypes"].items():
-            dtype_header, (start, stop) = dtype
-            column_dtypes[name] = pickle.loads(
-                dtype_header["type-serialized"]
-            ).deserialize(dtype_header, frames[start:stop])
-        return cls(
-            plc.contiguous_split.pack(
-                plc.contiguous_split.unpack_from_memoryviews(
-                    memoryview(header["metadata"]),
-                    plc.gpumemoryview(frames[0]),
-                )
-            ),
-            header["column-names"],
-            header["index-names"],
-            column_dtypes,
-        )
-
-    @classmethod
-    def from_py_table(cls, input_table, keep_index=True):
-        if keep_index and (
-            not isinstance(input_table.index, cudf.RangeIndex)
-            or input_table.index.start != 0
-            or input_table.index.stop != len(input_table)
-            or input_table.index.step != 1
-        ):
-            columns = input_table._index._columns + input_table._columns
-            index_names = input_table._index_names
-        else:
-            columns = input_table._columns
-            index_names = None
-
-        column_names = input_table._column_names
-        column_dtypes = {}
-        for name, col in input_table._column_labels_and_values:
-            if isinstance(
-                col.dtype,
-                (cudf.core.dtypes._BaseDtype, cudf.core.dtypes.CategoricalDtype)
-            ):
-                column_dtypes[name] = col.dtype
-
-        return cls(
-            plc.contiguous_split.pack(
-                plc.Table(
-                    [
-                        col.to_pylibcudf(mode="read") for col in columns
-                    ]
-                )
-            ),
-            column_names,
-            index_names,
-            column_dtypes,
-        )
-
-    def unpack(self):
-        output_table = cudf.DataFrame._from_data(*data_from_pylibcudf_table(
-            plc.contiguous_split.unpack_from_memoryviews(
-                self._metadata,
-                self._gpu_data
-            ),
-            self.column_names,
-            self.index_names
-        ))
-        for name, dtype in self.column_dtypes.items():
-            output_table._data[name] = (
-                output_table._data[name]._with_type_metadata(dtype)
-            )
-
-        return output_table
-
-
-def pack(input_table, keep_index=True):
-    """
-    Pack the columns of a cudf Frame into a single GPU memory buffer.
-    """
-    return PackedColumns.from_py_table(input_table, keep_index)
-
-
-def unpack(packed):
-    """
-    Unpack the results of packing a cudf Frame returning a new
-    cudf Frame in the process.
-    """
-    return packed.unpack()
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
deleted file mode 100644
index 641fc18c203..00000000000
--- a/python/cudf/cudf/_lib/csv.pyx
+++ /dev/null
@@ -1,414 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp cimport bool
-
-cimport pylibcudf.libcudf.types as libcudf_types
-
-from cudf._lib.types cimport dtype_to_pylibcudf_type
-
-import errno
-import os
-from collections import abc
-from io import BytesIO, StringIO
-
-import numpy as np
-import pandas as pd
-
-import cudf
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp cimport bool
-
-from cudf._lib.utils cimport data_from_pylibcudf_io
-
-import pylibcudf as plc
-
-from cudf.api.types import is_hashable
-
-from pylibcudf.types cimport DataType
-
-CSV_HEX_TYPE_MAP = {
-    "hex": np.dtype("int64"),
-    "hex64": np.dtype("int64"),
-    "hex32": np.dtype("int32")
-}
-
-
-def validate_args(
-    object delimiter,
-    object sep,
-    bool delim_whitespace,
-    object decimal,
-    object thousands,
-    object nrows,
-    int skipfooter,
-    object byte_range,
-    int skiprows
-):
-    if delim_whitespace:
-        if delimiter is not None:
-            raise ValueError("cannot set both delimiter and delim_whitespace")
-        if sep != ',':
-            raise ValueError("cannot set both sep and delim_whitespace")
-
-    # Alias sep -> delimiter.
-    actual_delimiter = delimiter if delimiter else sep
-
-    if decimal == actual_delimiter:
-        raise ValueError("decimal cannot be the same as delimiter")
-
-    if thousands == actual_delimiter:
-        raise ValueError("thousands cannot be the same as delimiter")
-
-    if nrows is not None and skipfooter != 0:
-        raise ValueError("cannot use both nrows and skipfooter parameters")
-
-    if byte_range is not None:
-        if skipfooter != 0 or skiprows != 0 or nrows is not None:
-            raise ValueError("""cannot manually limit rows to be read when
-                                using the byte range parameter""")
-
-
-def read_csv(
-    object datasource,
-    object lineterminator="\n",
-    object quotechar='"',
-    int quoting=0,
-    bool doublequote=True,
-    object header="infer",
-    bool mangle_dupe_cols=True,
-    object usecols=None,
-    object sep=",",
-    object delimiter=None,
-    bool delim_whitespace=False,
-    bool skipinitialspace=False,
-    object names=None,
-    object dtype=None,
-    int skipfooter=0,
-    int skiprows=0,
-    bool dayfirst=False,
-    object compression="infer",
-    object thousands=None,
-    object decimal=".",
-    object true_values=None,
-    object false_values=None,
-    object nrows=None,
-    object byte_range=None,
-    bool skip_blank_lines=True,
-    object parse_dates=None,
-    object comment=None,
-    object na_values=None,
-    bool keep_default_na=True,
-    bool na_filter=True,
-    object prefix=None,
-    object index_col=None,
-):
-    """
-    Cython function to call into libcudf API, see `read_csv`.
-
-    See Also
-    --------
-    cudf.read_csv
-    """
-
-    if not isinstance(datasource, (BytesIO, StringIO, bytes)):
-        if not os.path.isfile(datasource):
-            raise FileNotFoundError(
-                errno.ENOENT, os.strerror(errno.ENOENT), datasource
-            )
-
-    if isinstance(datasource, StringIO):
-        datasource = datasource.read().encode()
-    elif isinstance(datasource, str) and not os.path.isfile(datasource):
-        datasource = datasource.encode()
-
-    validate_args(delimiter, sep, delim_whitespace, decimal, thousands,
-                  nrows, skipfooter, byte_range, skiprows)
-
-    # Alias sep -> delimiter.
-    if delimiter is None:
-        delimiter = sep
-
-    delimiter = str(delimiter)
-
-    if byte_range is None:
-        byte_range = (0, 0)
-
-    if compression is None:
-        c_compression = plc.io.types.CompressionType.NONE
-    else:
-        compression_map = {
-            "infer": plc.io.types.CompressionType.AUTO,
-            "gzip": plc.io.types.CompressionType.GZIP,
-            "bz2": plc.io.types.CompressionType.BZIP2,
-            "zip": plc.io.types.CompressionType.ZIP,
-        }
-        c_compression = compression_map[compression]
-
-    # We need this later when setting index cols
-    orig_header = header
-
-    if names is not None:
-        # explicitly mentioned name, so don't check header
-        if header is None or header == 'infer':
-            header = -1
-        else:
-            header = header
-        names = list(names)
-    else:
-        if header is None:
-            header = -1
-        elif header == 'infer':
-            header = 0
-
-    hex_cols = []
-
-    new_dtypes = []
-    if dtype is not None:
-        if isinstance(dtype, abc.Mapping):
-            new_dtypes = dict()
-            for k, v in dtype.items():
-                col_type = v
-                if is_hashable(v) and v in CSV_HEX_TYPE_MAP:
-                    col_type = CSV_HEX_TYPE_MAP[v]
-                    hex_cols.append(str(k))
-
-                new_dtypes[k] = _get_plc_data_type_from_dtype(
-                    cudf.dtype(col_type)
-                )
-        elif (
-            cudf.api.types.is_scalar(dtype) or
-            isinstance(dtype, (
-                np.dtype, pd.api.extensions.ExtensionDtype, type
-            ))
-        ):
-            if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP:
-                dtype = CSV_HEX_TYPE_MAP[dtype]
-                hex_cols.append(0)
-
-            new_dtypes.append(
-                _get_plc_data_type_from_dtype(dtype)
-            )
-        elif isinstance(dtype, abc.Collection):
-            for index, col_dtype in enumerate(dtype):
-                if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP:
-                    col_dtype = CSV_HEX_TYPE_MAP[col_dtype]
-                    hex_cols.append(index)
-
-                new_dtypes.append(
-                    _get_plc_data_type_from_dtype(col_dtype)
-                )
-        else:
-            raise ValueError(
-                "dtype should be a scalar/str/list-like/dict-like"
-            )
-    options = (
-        plc.io.csv.CsvReaderOptions.builder(plc.io.SourceInfo([datasource]))
-        .compression(c_compression)
-        .mangle_dupe_cols(mangle_dupe_cols)
-        .byte_range_offset(byte_range[0])
-        .byte_range_size(byte_range[1])
-        .nrows(nrows if nrows is not None else -1)
-        .skiprows(skiprows)
-        .skipfooter(skipfooter)
-        .quoting(quoting)
-        .lineterminator(str(lineterminator))
-        .quotechar(quotechar)
-        .decimal(decimal)
-        .delim_whitespace(delim_whitespace)
-        .skipinitialspace(skipinitialspace)
-        .skip_blank_lines(skip_blank_lines)
-        .doublequote(doublequote)
-        .keep_default_na(keep_default_na)
-        .na_filter(na_filter)
-        .dayfirst(dayfirst)
-        .build()
-    )
-
-    options.set_header(header)
-
-    if names is not None:
-        options.set_names([str(name) for name in names])
-
-    if prefix is not None:
-        options.set_prefix(prefix)
-
-    if usecols is not None:
-        if all(isinstance(col, int) for col in usecols):
-            options.set_use_cols_indexes(list(usecols))
-        else:
-            options.set_use_cols_names([str(name) for name in usecols])
-
-    if delimiter is not None:
-        options.set_delimiter(delimiter)
-
-    if thousands is not None:
-        options.set_thousands(thousands)
-
-    if comment is not None:
-        options.set_comment(comment)
-
-    if parse_dates is not None:
-        options.set_parse_dates(list(parse_dates))
-
-    if hex_cols is not None:
-        options.set_parse_hex(list(hex_cols))
-
-    options.set_dtypes(new_dtypes)
-
-    if true_values is not None:
-        options.set_true_values([str(val) for val in true_values])
-
-    if false_values is not None:
-        options.set_false_values([str(val) for val in false_values])
-
-    if na_values is not None:
-        options.set_na_values([str(val) for val in na_values])
-
-    df = cudf.DataFrame._from_data(
-        *data_from_pylibcudf_io(plc.io.csv.read_csv(options))
-    )
-
-    if dtype is not None:
-        if isinstance(dtype, abc.Mapping):
-            for k, v in dtype.items():
-                if isinstance(cudf.dtype(v), cudf.CategoricalDtype):
-                    df._data[str(k)] = df._data[str(k)].astype(v)
-        elif (
-            cudf.api.types.is_scalar(dtype) or
-            isinstance(dtype, (
-                np.dtype, pd.api.extensions.ExtensionDtype, type
-            ))
-        ):
-            if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype):
-                df = df.astype(dtype)
-        elif isinstance(dtype, abc.Collection):
-            for index, col_dtype in enumerate(dtype):
-                if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype):
-                    col_name = df._column_names[index]
-                    df._data[col_name] = df._data[col_name].astype(col_dtype)
-
-    if names is not None and len(names) and isinstance(names[0], int):
-        df.columns = [int(x) for x in df._data]
-    elif names is None and header == -1 and cudf.get_option("mode.pandas_compatible"):
-        df.columns = [int(x) for x in df._column_names]
-
-    # Set index if the index_col parameter is passed
-    if index_col is not None and index_col is not False:
-        if isinstance(index_col, int):
-            index_col_name = df._data.get_labels_by_index(index_col)[0]
-            df = df.set_index(index_col_name)
-            if isinstance(index_col_name, str) and \
-                    names is None and orig_header == "infer":
-                if index_col_name.startswith("Unnamed:"):
-                    # TODO: Try to upstream it to libcudf
-                    # csv reader in future
-                    df._index.name = None
-            elif names is None:
-                df._index.name = index_col
-        else:
-            df = df.set_index(index_col)
-
-    return df
-
-
-@acquire_spill_lock()
-def write_csv(
-    table,
-    object path_or_buf=None,
-    object sep=",",
-    object na_rep="",
-    bool header=True,
-    object lineterminator="\n",
-    int rows_per_chunk=8,
-    bool index=True,
-):
-    """
-    Cython function to call into libcudf API, see `write_csv`.
-
-    See Also
-    --------
-    cudf.to_csv
-    """
-    index_and_not_empty = index is True and table.index is not None
-    columns = [
-        col.to_pylibcudf(mode="read") for col in table.index._columns
-    ] if index_and_not_empty else []
-    columns.extend(col.to_pylibcudf(mode="read") for col in table._columns)
-    col_names = []
-    if header:
-        all_names = list(table.index.names) if index_and_not_empty else []
-        all_names.extend(
-            na_rep if name is None or pd.isnull(name)
-            else name for name in table._column_names
-        )
-        col_names = [
-            '""' if (name in (None, '') and len(all_names) == 1)
-            else (str(name) if name not in (None, '') else '')
-            for name in all_names
-        ]
-    try:
-        plc.io.csv.write_csv(
-            (
-                plc.io.csv.CsvWriterOptions.builder(
-                    plc.io.SinkInfo([path_or_buf]), plc.Table(columns)
-                )
-                .names(col_names)
-                .na_rep(na_rep)
-                .include_header(header)
-                .rows_per_chunk(rows_per_chunk)
-                .line_terminator(str(lineterminator))
-                .inter_column_delimiter(str(sep))
-                .true_value("True")
-                .false_value("False")
-                .build()
-            )
-        )
-    except OverflowError:
-        raise OverflowError(
-            f"Writing CSV file with chunksize={rows_per_chunk} failed. "
-            "Consider providing a smaller chunksize argument."
-        )
-
-
-cdef DataType _get_plc_data_type_from_dtype(object dtype) except *:
-    # TODO: Remove this work-around Dictionary types
-    # in libcudf are fully mapped to categorical columns:
-    # https://github.com/rapidsai/cudf/issues/3960
-    if isinstance(dtype, cudf.CategoricalDtype):
-        dtype = dtype.categories.dtype
-    elif dtype == "category":
-        dtype = "str"
-
-    if isinstance(dtype, str):
-        if str(dtype) == "date32":
-            return DataType(
-                libcudf_types.type_id.TIMESTAMP_DAYS
-            )
-        elif str(dtype) in ("date", "date64"):
-            return DataType(
-                libcudf_types.type_id.TIMESTAMP_MILLISECONDS
-            )
-        elif str(dtype) == "timestamp":
-            return DataType(
-                libcudf_types.type_id.TIMESTAMP_MILLISECONDS
-            )
-        elif str(dtype) == "timestamp[us]":
-            return DataType(
-                libcudf_types.type_id.TIMESTAMP_MICROSECONDS
-            )
-        elif str(dtype) == "timestamp[s]":
-            return DataType(
-                libcudf_types.type_id.TIMESTAMP_SECONDS
-            )
-        elif str(dtype) == "timestamp[ms]":
-            return DataType(
-                libcudf_types.type_id.TIMESTAMP_MILLISECONDS
-            )
-        elif str(dtype) == "timestamp[ns]":
-            return DataType(
-                libcudf_types.type_id.TIMESTAMP_NANOSECONDS
-            )
-
-    dtype = cudf.dtype(dtype)
-    return dtype_to_pylibcudf_type(dtype)
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
deleted file mode 100644
index 80a77ef2267..00000000000
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ /dev/null
@@ -1,281 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-from functools import singledispatch
-
-from pandas.errors import DataError
-
-from cudf.api.types import _is_categorical_dtype, is_string_dtype
-from cudf.core.buffer import acquire_spill_lock
-from cudf.core.dtypes import (
-    CategoricalDtype,
-    DecimalDtype,
-    IntervalDtype,
-    ListDtype,
-    StructDtype,
-)
-
-from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport columns_from_pylibcudf_table
-
-from cudf._lib.scalar import as_device_scalar
-
-import pylibcudf
-
-from cudf.core._internals.aggregation import make_aggregation
-
-# The sets below define the possible aggregations that can be performed on
-# different dtypes. These strings must be elements of the AggregationKind enum.
-# The libcudf infrastructure exists for "COLLECT" support on
-# categoricals, but the dtype support in python does not.
-_CATEGORICAL_AGGS = {"COUNT", "NUNIQUE", "SIZE", "UNIQUE"}
-_STRING_AGGS = {
-    "COLLECT",
-    "COUNT",
-    "MAX",
-    "MIN",
-    "NTH",
-    "NUNIQUE",
-    "SIZE",
-    "UNIQUE",
-}
-_LIST_AGGS = {"COLLECT"}
-_STRUCT_AGGS = {"COLLECT", "CORRELATION", "COVARIANCE"}
-_INTERVAL_AGGS = {"COLLECT"}
-_DECIMAL_AGGS = {
-    "ARGMIN",
-    "ARGMAX",
-    "COLLECT",
-    "COUNT",
-    "MAX",
-    "MIN",
-    "NTH",
-    "NUNIQUE",
-    "SUM",
-}
-
-
-@singledispatch
-def get_valid_aggregation(dtype):
-    if is_string_dtype(dtype):
-        return _STRING_AGGS
-    return "ALL"
-
-
-@get_valid_aggregation.register
-def _(dtype: ListDtype):
-    return _LIST_AGGS
-
-
-@get_valid_aggregation.register
-def _(dtype: CategoricalDtype):
-    return _CATEGORICAL_AGGS
-
-
-@get_valid_aggregation.register
-def _(dtype: ListDtype):
-    return _LIST_AGGS
-
-
-@get_valid_aggregation.register
-def _(dtype: StructDtype):
-    return _STRUCT_AGGS
-
-
-@get_valid_aggregation.register
-def _(dtype: IntervalDtype):
-    return _INTERVAL_AGGS
-
-
-@get_valid_aggregation.register
-def _(dtype: DecimalDtype):
-    return _DECIMAL_AGGS
-
-
-cdef class GroupBy:
-    cdef dict __dict__
-
-    def __init__(self, keys, dropna=True):
-        with acquire_spill_lock() as spill_lock:
-            self._groupby = pylibcudf.groupby.GroupBy(
-                pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in keys]),
-                pylibcudf.types.NullPolicy.EXCLUDE if dropna
-                else pylibcudf.types.NullPolicy.INCLUDE
-            )
-
-            # We spill lock the columns while this GroupBy instance is alive.
-            self._spill_lock = spill_lock
-
-    def groups(self, list values):
-        """
-        Perform a sort groupby, using the keys used to construct the Groupby as the key
-        columns and ``values`` as the value columns.
-
-        Parameters
-        ----------
-        values: list of Columns
-            The value columns
-
-        Returns
-        -------
-        offsets: list of integers
-            Integer offsets such that offsets[i+1] - offsets[i]
-            represents the size of group `i`.
-        grouped_keys: list of Columns
-            The grouped key columns
-        grouped_values: list of Columns
-            The grouped value columns
-        """
-        offsets, grouped_keys, grouped_values = self._groupby.get_groups(
-            pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values])
-            if values else None
-        )
-
-        return (
-            offsets,
-            columns_from_pylibcudf_table(grouped_keys),
-            (
-                columns_from_pylibcudf_table(grouped_values)
-                if grouped_values is not None else []
-            ),
-        )
-
-    def aggregate(self, values, aggregations):
-        """
-        Parameters
-        ----------
-        values : Frame
-        aggregations
-            A dict mapping column names in `Frame` to a list of aggregations
-            to perform on that column
-
-            Each aggregation may be specified as:
-            - a string (e.g., "max")
-            - a lambda/function
-
-        Returns
-        -------
-        Frame of aggregated values
-        """
-        included_aggregations = []
-        column_included = []
-        requests = []
-        for i, (col, aggs) in enumerate(zip(values, aggregations)):
-            valid_aggregations = get_valid_aggregation(col.dtype)
-            included_aggregations_i = []
-            col_aggregations = []
-            for agg in aggs:
-                str_agg = str(agg)
-                if (
-                    is_string_dtype(col)
-                    and agg not in _STRING_AGGS
-                    and
-                    (
-                        str_agg in {"cumsum", "cummin", "cummax"}
-                        or not (
-                        any(a in str_agg for a in {
-                            "count",
-                            "max",
-                            "min",
-                            "first",
-                            "last",
-                            "nunique",
-                            "unique",
-                            "nth"
-                        })
-                        or (agg is list)
-                        )
-                    )
-                ):
-                    raise TypeError(
-                        f"function is not supported for this dtype: {agg}"
-                    )
-                elif (
-                    _is_categorical_dtype(col)
-                    and agg not in _CATEGORICAL_AGGS
-                    and (
-                        str_agg in {"cumsum", "cummin", "cummax"}
-                        or
-                        not (
-                            any(a in str_agg for a in {"count", "max", "min", "unique"})
-                        )
-                    )
-                ):
-                    raise TypeError(
-                        f"{col.dtype} type does not support {agg} operations"
-                    )
-
-                agg_obj = make_aggregation(agg)
-                if valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations:
-                    included_aggregations_i.append((agg, agg_obj.kind))
-                    col_aggregations.append(agg_obj.c_obj)
-            included_aggregations.append(included_aggregations_i)
-            if col_aggregations:
-                requests.append(pylibcudf.groupby.GroupByRequest(
-                    col.to_pylibcudf(mode="read"), col_aggregations
-                ))
-                column_included.append(i)
-
-        if not requests and any(len(v) > 0 for v in aggregations):
-            raise DataError("All requested aggregations are unsupported.")
-
-        keys, results = self._groupby.scan(requests) if \
-            _is_all_scan_aggregate(aggregations) else self._groupby.aggregate(requests)
-
-        result_columns = [[] for _ in range(len(values))]
-        for i, result in zip(column_included, results):
-            result_columns[i] = columns_from_pylibcudf_table(result)
-
-        return result_columns, columns_from_pylibcudf_table(keys), included_aggregations
-
-    def shift(self, list values, int periods, list fill_values):
-        keys, shifts = self._groupby.shift(
-            pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]),
-            [periods] * len(values),
-            [
-                (<DeviceScalar> as_device_scalar(val, dtype=col.dtype)).c_value
-                for val, col in zip(fill_values, values)
-            ],
-        )
-
-        return columns_from_pylibcudf_table(shifts), columns_from_pylibcudf_table(keys)
-
-    def replace_nulls(self, list values, object method):
-        _, replaced = self._groupby.replace_nulls(
-            pylibcudf.table.Table([c.to_pylibcudf(mode="read") for c in values]),
-            [
-                pylibcudf.replace.ReplacePolicy.PRECEDING
-                if method == 'ffill' else pylibcudf.replace.ReplacePolicy.FOLLOWING
-            ] * len(values),
-        )
-
-        return columns_from_pylibcudf_table(replaced)
-
-
-_GROUPBY_SCANS = {"cumcount", "cumsum", "cummin", "cummax", "cumprod", "rank"}
-
-
-def _is_all_scan_aggregate(all_aggs):
-    """
-    Returns true if all are scan aggregations.
-    Raises
-    ------
-    NotImplementedError
-        If both reduction aggregations and scan aggregations are present.
-    """
-
-    def get_name(agg):
-        return agg.__name__ if callable(agg) else agg
-
-    all_scan = all(
-        get_name(agg_name) in _GROUPBY_SCANS for aggs in all_aggs
-        for agg_name in aggs
-    )
-    any_scan = any(
-        get_name(agg_name) in _GROUPBY_SCANS for aggs in all_aggs
-        for agg_name in aggs
-    )
-
-    if not all_scan and any_scan:
-        raise NotImplementedError(
-            "Cannot perform both aggregation and scan in one operation"
-        )
-    return all_scan and any_scan
diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
deleted file mode 100644
index 1c9d3a01b80..00000000000
--- a/python/cudf/cudf/_lib/interop.pyx
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import pylibcudf
-
-from cudf._lib.utils cimport columns_from_pylibcudf_table
-
-from cudf.core.buffer import acquire_spill_lock
-from cudf.core.dtypes import ListDtype, StructDtype
-
-
-def from_dlpack(object dlpack_capsule):
-    """
-    Converts a DLPack Tensor PyCapsule into a list of columns.
-
-    DLPack Tensor PyCapsule is expected to have the name "dltensor".
-    """
-    return columns_from_pylibcudf_table(
-        pylibcudf.interop.from_dlpack(dlpack_capsule)
-    )
-
-
-def to_dlpack(list source_columns):
-    """
-    Converts a list of columns into a DLPack Tensor PyCapsule.
-
-    DLPack Tensor PyCapsule will have the name "dltensor".
-    """
-    return pylibcudf.interop.to_dlpack(
-        pylibcudf.Table(
-            [col.to_pylibcudf(mode="read") for col in source_columns]
-        )
-    )
-
-
-def gather_metadata(object cols_dtypes):
-    """
-    Generates a ColumnMetadata vector for each column.
-
-    Parameters
-    ----------
-    cols_dtypes : iterable
-        An iterable of ``(column_name, dtype)`` pairs.
-    """
-    cpp_metadata = []
-    if cols_dtypes is not None:
-        for idx, (col_name, col_dtype) in enumerate(cols_dtypes):
-            cpp_metadata.append(pylibcudf.interop.ColumnMetadata(col_name))
-            if isinstance(col_dtype, (ListDtype, StructDtype)):
-                _set_col_children_metadata(col_dtype, cpp_metadata[idx])
-    else:
-        raise TypeError(
-            "An iterable of (column_name, dtype) pairs is required to "
-            "construct column_metadata"
-        )
-    return cpp_metadata
-
-
-def _set_col_children_metadata(dtype, col_meta):
-    if isinstance(dtype, StructDtype):
-        for name, value in dtype.fields.items():
-            element_metadata = pylibcudf.interop.ColumnMetadata(name)
-            _set_col_children_metadata(value, element_metadata)
-            col_meta.children_meta.append(element_metadata)
-    elif isinstance(dtype, ListDtype):
-        # Offsets - child 0
-        col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())
-
-        # Element column - child 1
-        element_metadata = pylibcudf.interop.ColumnMetadata()
-        _set_col_children_metadata(dtype.element_type, element_metadata)
-        col_meta.children_meta.append(element_metadata)
-    else:
-        col_meta.children_meta.append(pylibcudf.interop.ColumnMetadata())
-
-
-@acquire_spill_lock()
-def to_arrow(list source_columns, object column_dtypes):
-    """Convert a list of columns from
-    cudf Frame to a PyArrow Table.
-
-    Parameters
-    ----------
-    source_columns : a list of columns to convert
-    column_dtypes : Iterable of ``(column_name, column_dtype)`` pairs
-
-    Returns
-    -------
-    pyarrow table
-    """
-    cpp_metadata = gather_metadata(column_dtypes)
-    return pylibcudf.interop.to_arrow(
-        pylibcudf.Table([c.to_pylibcudf(mode="read") for c in source_columns]),
-        cpp_metadata,
-    )
-
-
-@acquire_spill_lock()
-def from_arrow(object input_table):
-    """Convert from PyArrow Table to a list of columns.
-
-    Parameters
-    ----------
-    input_table : PyArrow table
-
-    Returns
-    -------
-    A list of columns to construct Frame object
-    """
-    return columns_from_pylibcudf_table(
-        pylibcudf.interop.from_arrow(input_table)
-    )
diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt
deleted file mode 100644
index e7408cf2852..00000000000
--- a/python/cudf/cudf/_lib/io/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-set(cython_sources utils.pyx)
-set(linked_libraries cudf::cudf)
-rapids_cython_create_modules(
-  CXX
-  SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX io_ ASSOCIATED_TARGETS cudf
-)
diff --git a/python/cudf/cudf/_lib/io/__init__.pxd b/python/cudf/cudf/_lib/io/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/io/__init__.py b/python/cudf/cudf/_lib/io/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd
deleted file mode 100644
index 96504ebdd66..00000000000
--- a/python/cudf/cudf/_lib/io/utils.pxd
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-from libcpp.vector cimport vector
-
-from pylibcudf.libcudf.io.data_sink cimport data_sink
-from pylibcudf.libcudf.io.types cimport (
-    column_name_info,
-    sink_info,
-    source_info,
-)
-
-from cudf._lib.column cimport Column
-
-
-cdef sink_info make_sinks_info(
-    list src, vector[unique_ptr[data_sink]] & data) except*
-cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except*
-cdef add_df_col_struct_names(
-    df,
-    child_names_dict
-)
-cdef update_col_struct_field_names(
-    Column col,
-    child_names
-)
-cdef update_struct_field_names(
-    table,
-    vector[column_name_info]& schema_info)
-cdef Column update_column_struct_field_names(
-    Column col,
-    column_name_info& info
-)
diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx
deleted file mode 100644
index f23980b387a..00000000000
--- a/python/cudf/cudf/_lib/io/utils.pyx
+++ /dev/null
@@ -1,155 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cpython.buffer cimport PyBUF_READ
-from cpython.memoryview cimport PyMemoryView_FromMemory
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
-
-from pylibcudf.libcudf.io.data_sink cimport data_sink
-from pylibcudf.libcudf.io.types cimport (
-    column_name_info,
-    sink_info,
-)
-
-from cudf._lib.column cimport Column
-
-import codecs
-import io
-import os
-
-from cudf.core.dtypes import StructDtype
-
-# Converts the Python sink input to libcudf IO sink_info.
-cdef sink_info make_sinks_info(
-    list src, vector[unique_ptr[data_sink]] & sink
-) except*:
-    cdef vector[data_sink *] data_sinks
-    cdef vector[string] paths
-    if isinstance(src[0], io.StringIO):
-        data_sinks.reserve(len(src))
-        for s in src:
-            sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s)))
-            data_sinks.push_back(sink.back().get())
-        return sink_info(data_sinks)
-    elif isinstance(src[0], io.TextIOBase):
-        data_sinks.reserve(len(src))
-        for s in src:
-            # Files opened in text mode expect writes to be str rather than
-            # bytes, which requires conversion from utf-8. If the underlying
-            # buffer is utf-8, we can bypass this conversion by writing
-            # directly to it.
-            if codecs.lookup(s.encoding).name not in {"utf-8", "ascii"}:
-                raise NotImplementedError(f"Unsupported encoding {s.encoding}")
-            sink.push_back(
-                unique_ptr[data_sink](new iobase_data_sink(s.buffer))
-            )
-            data_sinks.push_back(sink.back().get())
-        return sink_info(data_sinks)
-    elif isinstance(src[0], io.IOBase):
-        data_sinks.reserve(len(src))
-        for s in src:
-            sink.push_back(unique_ptr[data_sink](new iobase_data_sink(s)))
-            data_sinks.push_back(sink.back().get())
-        return sink_info(data_sinks)
-    elif isinstance(src[0], (basestring, os.PathLike)):
-        paths.reserve(len(src))
-        for s in src:
-            paths.push_back(<string> os.path.expanduser(s).encode())
-        return sink_info(move(paths))
-    else:
-        raise TypeError("Unrecognized input type: {}".format(type(src)))
-
-
-cdef sink_info make_sink_info(src, unique_ptr[data_sink] & sink) except*:
-    cdef vector[unique_ptr[data_sink]] datasinks
-    cdef sink_info info = make_sinks_info([src], datasinks)
-    if not datasinks.empty():
-        sink.swap(datasinks[0])
-    return info
-
-
-# Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you
-# write from cudf to any python file-like object (File/BytesIO/SocketIO etc)
-cdef cppclass iobase_data_sink(data_sink):
-    object buf
-
-    iobase_data_sink(object buf_):
-        this.buf = buf_
-
-    void host_write(const void * data, size_t size) with gil:
-        if isinstance(buf, io.StringIO):
-            buf.write(PyMemoryView_FromMemory(<char*>data, size, PyBUF_READ)
-                      .tobytes().decode())
-        else:
-            buf.write(PyMemoryView_FromMemory(<char*>data, size, PyBUF_READ))
-
-    void flush() with gil:
-        buf.flush()
-
-    size_t bytes_written() with gil:
-        return buf.tell()
-
-
-cdef add_df_col_struct_names(df, child_names_dict):
-    for name, child_names in child_names_dict.items():
-        col = df._data[name]
-
-        df._data[name] = update_col_struct_field_names(col, child_names)
-
-
-cdef update_col_struct_field_names(Column col, child_names):
-    if col.children:
-        children = list(col.children)
-        for i, (child, names) in enumerate(zip(children, child_names.values())):
-            children[i] = update_col_struct_field_names(
-                child,
-                names
-            )
-        col.set_base_children(tuple(children))
-
-    if isinstance(col.dtype, StructDtype):
-        col = col._rename_fields(
-            child_names.keys()
-        )
-
-    return col
-
-
-cdef update_struct_field_names(
-    table,
-    vector[column_name_info]& schema_info
-):
-    # Deprecated, remove in favor of add_col_struct_names
-    # when a reader is ported to pylibcudf
-    for i, (name, col) in enumerate(table._column_labels_and_values):
-        table._data[name] = update_column_struct_field_names(
-            col, schema_info[i]
-        )
-
-
-cdef Column update_column_struct_field_names(
-    Column col,
-    column_name_info& info
-):
-    cdef vector[string] field_names
-
-    if col.children:
-        children = list(col.children)
-        for i, child in enumerate(children):
-            children[i] = update_column_struct_field_names(
-                child,
-                info.children[i]
-            )
-        col.set_base_children(tuple(children))
-
-    if isinstance(col.dtype, StructDtype):
-        field_names.reserve(len(col.base_children))
-        for i in range(info.children.size()):
-            field_names.push_back(info.children[i].name)
-        col = col._rename_fields(
-            field_names
-        )
-
-    return col
diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
deleted file mode 100644
index 22ec5d472f2..00000000000
--- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-set(cython_sources
-    byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
-    ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
-)
-set(linked_libraries cudf::cudf)
-rapids_cython_create_modules(
-  CXX
-  SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf
-)
diff --git a/python/cudf/cudf/_lib/nvtext/__init__.pxd b/python/cudf/cudf/_lib/nvtext/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/nvtext/__init__.py b/python/cudf/cudf/_lib/nvtext/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
deleted file mode 100644
index 2b2762eead2..00000000000
--- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-from pylibcudf.nvtext.byte_pair_encode import BPEMergePairs  # no-cython-lint
-
-
-@acquire_spill_lock()
-def byte_pair_encoding(
-    Column strings,
-    object merge_pairs,
-    object separator
-):
-    return Column.from_pylibcudf(
-        nvtext.byte_pair_encode.byte_pair_encoding(
-            strings.to_pylibcudf(mode="read"),
-            merge_pairs,
-            separator.device_value.c_value
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
deleted file mode 100644
index 3dd99c42d76..00000000000
--- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf cimport nvtext
-
-from cudf._lib.column cimport Column
-
-
-@acquire_spill_lock()
-def edit_distance(Column strings, Column targets):
-    result = nvtext.edit_distance.edit_distance(
-        strings.to_pylibcudf(mode="read"),
-        targets.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(result)
-
-
-@acquire_spill_lock()
-def edit_distance_matrix(Column strings):
-    result = nvtext.edit_distance.edit_distance_matrix(
-        strings.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
deleted file mode 100644
index 7fdf9258b7f..00000000000
--- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def generate_ngrams(Column strings, int ngrams, object py_separator):
-    result = nvtext.generate_ngrams.generate_ngrams(
-        strings.to_pylibcudf(mode="read"),
-        ngrams,
-        py_separator.device_value.c_value
-    )
-    return Column.from_pylibcudf(result)
-
-
-@acquire_spill_lock()
-def generate_character_ngrams(Column strings, int ngrams):
-    result = nvtext.generate_ngrams.generate_character_ngrams(
-        strings.to_pylibcudf(mode="read"),
-        ngrams
-    )
-    return Column.from_pylibcudf(result)
-
-
-@acquire_spill_lock()
-def hash_character_ngrams(Column strings, int ngrams):
-    result = nvtext.generate_ngrams.hash_character_ngrams(
-        strings.to_pylibcudf(mode="read"),
-        ngrams
-    )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
deleted file mode 100644
index c964d0206b7..00000000000
--- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def jaccard_index(Column input1, Column input2, int width):
-    result = nvtext.jaccard.jaccard_index(
-        input1.to_pylibcudf(mode="read"),
-        input2.to_pylibcudf(mode="read"),
-        width,
-    )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
deleted file mode 100644
index 25cfcf99ca6..00000000000
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-from libc.stdint cimport uint32_t, uint64_t
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def minhash(Column input, Column seeds, int width=4):
-    result = nvtext.minhash.minhash(
-        input.to_pylibcudf(mode="read"),
-        seeds.to_pylibcudf(mode="read"),
-        width,
-    )
-    return Column.from_pylibcudf(result)
-
-
-@acquire_spill_lock()
-def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width):
-    return Column.from_pylibcudf(
-        nvtext.minhash.minhash_permuted(
-            input.to_pylibcudf(mode="read"),
-            seed,
-            a.to_pylibcudf(mode="read"),
-            b.to_pylibcudf(mode="read"),
-            width,
-        )
-    )
-
-
-@acquire_spill_lock()
-def minhash64(Column input, Column seeds, int width=4):
-    result = nvtext.minhash.minhash64(
-        input.to_pylibcudf(mode="read"),
-        seeds.to_pylibcudf(mode="read"),
-        width,
-    )
-    return Column.from_pylibcudf(result)
-
-
-@acquire_spill_lock()
-def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width):
-    return Column.from_pylibcudf(
-        nvtext.minhash.minhash64_permuted(
-            input.to_pylibcudf(mode="read"),
-            seed,
-            a.to_pylibcudf(mode="read"),
-            b.to_pylibcudf(mode="read"),
-            width,
-        )
-    )
-
-
-@acquire_spill_lock()
-def word_minhash(Column input, Column seeds):
-    result = nvtext.minhash.word_minhash(
-        input.to_pylibcudf(mode="read"),
-        seeds.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(result)
-
-
-@acquire_spill_lock()
-def word_minhash64(Column input, Column seeds):
-    result = nvtext.minhash.word_minhash64(
-        input.to_pylibcudf(mode="read"),
-        seeds.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
deleted file mode 100644
index c125d92a24e..00000000000
--- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def ngrams_tokenize(
-    Column input,
-    int ngrams,
-    object py_delimiter,
-    object py_separator
-):
-    return Column.from_pylibcudf(
-        nvtext.ngrams_tokenize.ngrams_tokenize(
-            input.to_pylibcudf(mode="read"),
-            ngrams,
-            py_delimiter.device_value.c_value,
-            py_separator.device_value.c_value
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx
deleted file mode 100644
index cc45123dd0a..00000000000
--- a/python/cudf/cudf/_lib/nvtext/normalize.pyx
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp cimport bool
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def normalize_spaces(Column input):
-    return Column.from_pylibcudf(
-        nvtext.normalize.normalize_spaces(
-            input.to_pylibcudf(mode="read")
-        )
-    )
-
-
-@acquire_spill_lock()
-def normalize_characters(Column input, bool do_lower=True):
-    return Column.from_pylibcudf(
-        nvtext.normalize.normalize_characters(
-            input.to_pylibcudf(mode="read"),
-            do_lower,
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx
deleted file mode 100644
index bec56ade83c..00000000000
--- a/python/cudf/cudf/_lib/nvtext/replace.pyx
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def replace_tokens(Column strings,
-                   Column targets,
-                   Column replacements,
-                   object py_delimiter):
-    """
-    The `targets` tokens are searched for within each `strings`
-    in the Column and replaced with the corresponding `replacements`
-    if found. Tokens are identified by the `py_delimiter` character
-    provided.
-    """
-
-    return Column.from_pylibcudf(
-        nvtext.replace.replace_tokens(
-            strings.to_pylibcudf(mode="read"),
-            targets.to_pylibcudf(mode="read"),
-            replacements.to_pylibcudf(mode="read"),
-            py_delimiter.device_value.c_value,
-        )
-    )
-
-
-@acquire_spill_lock()
-def filter_tokens(Column strings,
-                  size_type min_token_length,
-                  object py_replacement,
-                  object py_delimiter):
-    """
-    Tokens smaller than `min_token_length` are removed from `strings`
-    in the Column and optionally replaced with the corresponding
-    `py_replacement` string. Tokens are identified by the `py_delimiter`
-    character provided.
-    """
-
-    return Column.from_pylibcudf(
-        nvtext.replace.filter_tokens(
-            strings.to_pylibcudf(mode="read"),
-            min_token_length,
-            py_replacement.device_value.c_value,
-            py_delimiter.device_value.c_value,
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx
deleted file mode 100644
index 63a389b64d5..00000000000
--- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from enum import IntEnum
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf.libcudf.nvtext.stemmer cimport (
-    letter_type,
-    underlying_type_t_letter_type,
-)
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-class LetterType(IntEnum):
-    CONSONANT = <underlying_type_t_letter_type> letter_type.CONSONANT
-    VOWEL = <underlying_type_t_letter_type> letter_type.VOWEL
-
-
-@acquire_spill_lock()
-def porter_stemmer_measure(Column strings):
-    return Column.from_pylibcudf(
-        nvtext.stemmer.porter_stemmer_measure(
-            strings.to_pylibcudf(mode="read"),
-        )
-    )
-
-
-@acquire_spill_lock()
-def is_letter(Column strings,
-              object ltype,
-              size_type index):
-    return Column.from_pylibcudf(
-        nvtext.stemmer.is_letter(
-            strings.to_pylibcudf(mode="read"),
-            ltype==LetterType.VOWEL,
-            index,
-        )
-    )
-
-
-@acquire_spill_lock()
-def is_letter_multi(Column strings,
-                    object ltype,
-                    Column indices):
-    return Column.from_pylibcudf(
-        nvtext.stemmer.is_letter(
-            strings.to_pylibcudf(mode="read"),
-            ltype==LetterType.VOWEL,
-            indices.to_pylibcudf(mode="read"),
-        )
-    )
diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
deleted file mode 100644
index 5e0bfb74705..00000000000
--- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libc.stdint cimport uint32_t
-
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp cimport bool
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def subword_tokenize_inmem_hash(
-    Column strings,
-    object hashed_vocabulary,
-    uint32_t max_sequence_length=64,
-    uint32_t stride=48,
-    bool do_lower=True,
-    bool do_truncate=False,
-):
-    """
-    Subword tokenizes text series by using the pre-loaded hashed vocabulary
-    """
-    result = nvtext.subword_tokenize.subword_tokenize(
-        strings.to_pylibcudf(mode="read"),
-        hashed_vocabulary,
-        max_sequence_length,
-        stride,
-        do_lower,
-        do_truncate,
-    )
-    # return the 3 tensor components
-    tokens = Column.from_pylibcudf(result[0])
-    masks = Column.from_pylibcudf(result[1])
-    metadata = Column.from_pylibcudf(result[2])
-    return tokens, masks, metadata
diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
deleted file mode 100644
index f473c48e2f7..00000000000
--- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf.libcudf.types cimport size_type
-
-from pylibcudf.nvtext.tokenize import TokenizeVocabulary  # no-cython-lint
-
-from cudf._lib.column cimport Column
-
-from pylibcudf import nvtext
-
-
-@acquire_spill_lock()
-def _tokenize_scalar(Column strings, object py_delimiter):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.tokenize_scalar(
-            strings.to_pylibcudf(mode="read"),
-            py_delimiter.device_value.c_value
-        )
-    )
-
-
-@acquire_spill_lock()
-def _tokenize_column(Column strings, Column delimiters):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.tokenize_column(
-            strings.to_pylibcudf(mode="read"),
-            delimiters.to_pylibcudf(mode="read"),
-        )
-    )
-
-
-@acquire_spill_lock()
-def _count_tokens_scalar(Column strings, object py_delimiter):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.count_tokens_scalar(
-            strings.to_pylibcudf(mode="read"),
-            py_delimiter.device_value.c_value
-        )
-    )
-
-
-@acquire_spill_lock()
-def _count_tokens_column(Column strings, Column delimiters):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.count_tokens_column(
-            strings.to_pylibcudf(mode="read"),
-            delimiters.to_pylibcudf(mode="read")
-        )
-    )
-
-
-@acquire_spill_lock()
-def character_tokenize(Column strings):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.character_tokenize(
-            strings.to_pylibcudf(mode="read")
-        )
-    )
-
-
-@acquire_spill_lock()
-def detokenize(Column strings, Column indices, object py_separator):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.detokenize(
-            strings.to_pylibcudf(mode="read"),
-            indices.to_pylibcudf(mode="read"),
-            py_separator.device_value.c_value
-        )
-    )
-
-
-@acquire_spill_lock()
-def tokenize_with_vocabulary(Column strings,
-                             object vocabulary,
-                             object py_delimiter,
-                             size_type default_id):
-    return Column.from_pylibcudf(
-        nvtext.tokenize.tokenize_with_vocabulary(
-            strings.to_pylibcudf(mode="read"),
-            vocabulary,
-            py_delimiter.device_value.c_value,
-            default_id
-        )
-    )
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
deleted file mode 100644
index c829cac6409..00000000000
--- a/python/cudf/cudf/_lib/orc.pyx
+++ /dev/null
@@ -1,466 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libc.stdint cimport int64_t
-from libcpp cimport bool, int
-from libcpp.map cimport map
-from libcpp.string cimport string
-from libcpp.vector cimport vector
-import itertools
-from collections import OrderedDict
-
-try:
-    import ujson as json
-except ImportError:
-    import json
-
-cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view
-
-from cudf._lib.column cimport Column
-from cudf._lib.io.utils cimport update_col_struct_field_names
-from cudf._lib.utils cimport data_from_pylibcudf_io
-
-import pylibcudf as plc
-
-import cudf
-from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES
-from cudf._lib.utils import _index_level_name, generate_pandas_metadata
-from cudf.core.buffer import acquire_spill_lock
-from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata
-from pylibcudf.io.orc cimport OrcChunkedWriter
-
-# TODO: Consider inlining this function since it seems to only be used in one place.
-cpdef read_parsed_orc_statistics(filepath_or_buffer):
-    """
-    Cython function to call into libcudf API, see `read_parsed_orc_statistics`.
-
-    See Also
-    --------
-    cudf.io.orc.read_orc_statistics
-    """
-
-    parsed = (
-        plc.io.orc.read_parsed_orc_statistics(
-            plc.io.SourceInfo([filepath_or_buffer])
-        )
-    )
-
-    return parsed.column_names, parsed.file_stats, parsed.stripes_stats
-
-
-cpdef read_orc(object filepaths_or_buffers,
-               object columns=None,
-               object stripes=None,
-               object skip_rows=None,
-               object num_rows=None,
-               bool use_index=True,
-               object timestamp_type=None):
-    """
-    Cython function to call into libcudf API, see `read_orc`.
-
-    See Also
-    --------
-    cudf.read_orc
-
-    Notes
-    -----
-    Currently this function only considers the metadata of the first file in the list of
-    filepaths_or_buffers.
-    """
-
-    if columns is not None:
-        columns = [str(col) for col in columns]
-
-    tbl_w_meta = plc.io.orc.read_orc(
-        plc.io.SourceInfo(filepaths_or_buffers),
-        columns,
-        stripes,
-        get_skiprows_arg(skip_rows),
-        get_num_rows_arg(num_rows),
-        use_index,
-        plc.types.DataType(
-            SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[
-                cudf.dtype(timestamp_type)
-            ]
-        )
-    )
-
-    names = tbl_w_meta.column_names(include_children=False)
-
-    actual_index_names, col_names, is_range_index, reset_index_name, \
-        range_idx = _get_index_from_metadata(tbl_w_meta.per_file_user_data,
-                                             names,
-                                             skip_rows,
-                                             num_rows)
-
-    if columns is not None and (isinstance(columns, list) and len(columns) == 0):
-        # When `columns=[]`, index needs to be
-        # established, but not the columns.
-        nrows = tbl_w_meta.tbl.num_rows()
-        return {}, cudf.RangeIndex(nrows)
-
-    data, index = data_from_pylibcudf_io(
-        tbl_w_meta,
-        col_names if columns is None else names,
-        actual_index_names
-    )
-
-    if is_range_index:
-        index = range_idx
-    elif reset_index_name:
-        index.names = [None] * len(index.names)
-
-    child_name_values = tbl_w_meta.child_names.values()
-
-    data = {
-        name: update_col_struct_field_names(
-            col, child_names
-        )
-        for (name, col), child_names in zip(data.items(), child_name_values)
-    }
-
-    return data, index
-
-
-def _get_comp_type(object compression):
-    if compression is None or compression is False:
-        return plc.io.types.CompressionType.NONE
-
-    compression = str(compression).upper()
-    if compression == "SNAPPY":
-        return plc.io.types.CompressionType.SNAPPY
-    elif compression == "ZLIB":
-        return plc.io.types.CompressionType.ZLIB
-    elif compression == "ZSTD":
-        return plc.io.types.CompressionType.ZSTD
-    elif compression == "LZ4":
-        return plc.io.types.CompressionType.LZ4
-    else:
-        raise ValueError(f"Unsupported `compression` type {compression}")
-
-
-cdef tuple _get_index_from_metadata(
-        vector[map[string, string]] user_data,
-        object names,
-        object skip_rows,
-        object num_rows):
-
-    meta = None
-    index_col = None
-    is_range_index = False
-    reset_index_name = False
-    range_idx = None
-
-    if user_data.size() > 0:
-        json_str = user_data[0][b'pandas'].decode('utf-8')
-        if json_str != "":
-            meta = json.loads(json_str)
-            if 'index_columns' in meta and len(meta['index_columns']) > 0:
-                index_col = meta['index_columns']
-                if isinstance(index_col[0], dict) and \
-                        index_col[0]['kind'] == 'range':
-                    is_range_index = True
-                else:
-                    index_col_names = OrderedDict()
-                    for idx_col in index_col:
-                        for c in meta['columns']:
-                            if c['field_name'] == idx_col:
-                                index_col_names[idx_col] = \
-                                    c['name'] or c['field_name']
-                                if c['name'] is None:
-                                    reset_index_name = True
-
-    actual_index_names = None
-    if index_col is not None and len(index_col) > 0:
-        if is_range_index:
-            range_index_meta = index_col[0]
-            range_idx = cudf.RangeIndex(
-                start=range_index_meta['start'],
-                stop=range_index_meta['stop'],
-                step=range_index_meta['step'],
-                name=range_index_meta['name']
-            )
-            if skip_rows is not None:
-                range_idx = range_idx[skip_rows:]
-            if num_rows is not None:
-                range_idx = range_idx[:num_rows]
-        else:
-            actual_index_names = list(index_col_names.values())
-            names = names[len(actual_index_names):]
-
-    return (
-        actual_index_names,
-        names,
-        is_range_index,
-        reset_index_name,
-        range_idx
-    )
-
-
-def _get_orc_stat_freq(str statistics):
-    """
-    Convert ORC statistics terms to CUDF convention:
-      - ORC "STRIPE"   == CUDF "ROWGROUP"
-      - ORC "ROWGROUP" == CUDF "PAGE"
-    """
-    statistics = str(statistics).upper()
-    if statistics == "NONE":
-        return plc.io.types.StatisticsFreq.STATISTICS_NONE
-    elif statistics == "STRIPE":
-        return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP
-    elif statistics == "ROWGROUP":
-        return plc.io.types.StatisticsFreq.STATISTICS_PAGE
-    else:
-        raise ValueError(f"Unsupported `statistics_freq` type {statistics}")
-
-
-@acquire_spill_lock()
-def write_orc(
-    table,
-    object path_or_buf,
-    object compression="snappy",
-    str statistics="ROWGROUP",
-    object stripe_size_bytes=None,
-    object stripe_size_rows=None,
-    object row_index_stride=None,
-    object cols_as_map_type=None,
-    object index=None
-):
-    """
-    Cython function to call into libcudf API, see `cudf::io::write_orc`.
-
-    See Also
-    --------
-    cudf.read_orc
-    """
-    user_data = {}
-    user_data["pandas"] = generate_pandas_metadata(table, index)
-    if index is True or (
-        index is None and not isinstance(table._index, cudf.RangeIndex)
-    ):
-        columns = table._columns if table._index is None else [
-            *table.index._columns, *table._columns
-        ]
-        plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns])
-        tbl_meta = TableInputMetadata(plc_table)
-        for level, idx_name in enumerate(table._index.names):
-            tbl_meta.column_metadata[level].set_name(
-                _index_level_name(idx_name, level, table._column_names)
-            )
-        num_index_cols_meta = len(table._index.names)
-    else:
-        plc_table = plc.Table(
-            [col.to_pylibcudf(mode="read") for col in table._columns]
-        )
-        tbl_meta = TableInputMetadata(plc_table)
-        num_index_cols_meta = 0
-
-    if cols_as_map_type is not None:
-        cols_as_map_type = set(cols_as_map_type)
-
-    for i, name in enumerate(table._column_names, num_index_cols_meta):
-        tbl_meta.column_metadata[i].set_name(name)
-        _set_col_children_metadata(
-            table[name]._column,
-            tbl_meta.column_metadata[i],
-            (cols_as_map_type is not None)
-            and (name in cols_as_map_type),
-        )
-
-    options = (
-        plc.io.orc.OrcWriterOptions.builder(
-            plc.io.SinkInfo([path_or_buf]), plc_table
-        )
-        .metadata(tbl_meta)
-        .key_value_metadata(user_data)
-        .compression(_get_comp_type(compression))
-        .enable_statistics(_get_orc_stat_freq(statistics))
-        .build()
-    )
-    if stripe_size_bytes is not None:
-        options.set_stripe_size_bytes(stripe_size_bytes)
-    if stripe_size_rows is not None:
-        options.set_stripe_size_rows(stripe_size_rows)
-    if row_index_stride is not None:
-        options.set_row_index_stride(row_index_stride)
-
-    plc.io.orc.write_orc(options)
-
-
-cdef int64_t get_skiprows_arg(object arg) except*:
-    arg = 0 if arg is None else arg
-    if not isinstance(arg, int) or arg < 0:
-        raise TypeError("skiprows must be an int >= 0")
-    return <int64_t> arg
-
-cdef int64_t get_num_rows_arg(object arg) except*:
-    arg = -1 if arg is None else arg
-    if not isinstance(arg, int) or arg < -1:
-        raise TypeError("num_rows must be an int >= -1")
-    return <int64_t> arg
-
-
-cdef class ORCWriter:
-    """
-    ORCWriter lets you you incrementally write out a ORC file from a series
-    of cudf tables
-
-    See Also
-    --------
-    cudf.io.orc.to_orc
-    """
-    cdef bool initialized
-    cdef OrcChunkedWriter writer
-    cdef SinkInfo sink
-    cdef str statistics
-    cdef object compression
-    cdef object index
-    cdef TableInputMetadata tbl_meta
-    cdef object cols_as_map_type
-    cdef object stripe_size_bytes
-    cdef object stripe_size_rows
-    cdef object row_index_stride
-
-    def __cinit__(self,
-                  object path,
-                  object index=None,
-                  object compression="snappy",
-                  str statistics="ROWGROUP",
-                  object cols_as_map_type=None,
-                  object stripe_size_bytes=None,
-                  object stripe_size_rows=None,
-                  object row_index_stride=None):
-        self.sink = plc.io.SinkInfo([path])
-        self.statistics = statistics
-        self.compression = compression
-        self.index = index
-        self.cols_as_map_type = cols_as_map_type \
-            if cols_as_map_type is None else set(cols_as_map_type)
-        self.stripe_size_bytes = stripe_size_bytes
-        self.stripe_size_rows = stripe_size_rows
-        self.row_index_stride = row_index_stride
-        self.initialized = False
-
-    def write_table(self, table):
-        """ Writes a single table to the file """
-        if not self.initialized:
-            self._initialize_chunked_state(table)
-
-        keep_index = self.index is not False and (
-            table._index.name is not None or
-            isinstance(table._index, cudf.core.multiindex.MultiIndex)
-        )
-        if keep_index:
-            columns = [
-                col.to_pylibcudf(mode="read")
-                for col in itertools.chain(table.index._columns, table._columns)
-            ]
-        else:
-            columns = [col.to_pylibcudf(mode="read") for col in table._columns]
-
-        self.writer.write(plc.Table(columns))
-
-    def close(self):
-        if not self.initialized:
-            return
-
-        self.writer.close()
-
-    def __dealloc__(self):
-        self.close()
-
-    def _initialize_chunked_state(self, table):
-        """
-        Prepare all the values required to build the
-        chunked_orc_writer_options anb creates a writer"""
-
-        num_index_cols_meta = 0
-        plc_table = plc.Table(
-            [
-                col.to_pylibcudf(mode="read")
-                for col in table._columns
-            ]
-        )
-        self.tbl_meta = TableInputMetadata(plc_table)
-        if self.index is not False:
-            if isinstance(table._index, cudf.core.multiindex.MultiIndex):
-                plc_table = plc.Table(
-                    [
-                        col.to_pylibcudf(mode="read")
-                        for col in itertools.chain(table.index._columns, table._columns)
-                    ]
-                )
-                self.tbl_meta = TableInputMetadata(plc_table)
-                for level, idx_name in enumerate(table._index.names):
-                    self.tbl_meta.column_metadata[level].set_name(
-                        idx_name
-                    )
-                num_index_cols_meta = len(table._index.names)
-            else:
-                if table._index.name is not None:
-                    plc_table = plc.Table(
-                        [
-                            col.to_pylibcudf(mode="read")
-                            for col in itertools.chain(
-                                table.index._columns, table._columns
-                            )
-                        ]
-                    )
-                    self.tbl_meta = TableInputMetadata(plc_table)
-                    self.tbl_meta.column_metadata[0].set_name(
-                        table._index.name
-                    )
-                    num_index_cols_meta = 1
-
-        for i, name in enumerate(table._column_names, num_index_cols_meta):
-            self.tbl_meta.column_metadata[i].set_name(name)
-            _set_col_children_metadata(
-                table[name]._column,
-                self.tbl_meta.column_metadata[i],
-                (self.cols_as_map_type is not None)
-                and (name in self.cols_as_map_type),
-            )
-
-        user_data = {}
-        pandas_metadata = generate_pandas_metadata(table, self.index)
-        user_data["pandas"] = pandas_metadata
-
-        options = (
-            plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink)
-            .metadata(self.tbl_meta)
-            .key_value_metadata(user_data)
-            .compression(_get_comp_type(self.compression))
-            .enable_statistics(_get_orc_stat_freq(self.statistics))
-            .build()
-        )
-        if self.stripe_size_bytes is not None:
-            options.set_stripe_size_bytes(self.stripe_size_bytes)
-        if self.stripe_size_rows is not None:
-            options.set_stripe_size_rows(self.stripe_size_rows)
-        if self.row_index_stride is not None:
-            options.set_row_index_stride(self.row_index_stride)
-
-        self.writer = plc.io.orc.OrcChunkedWriter.from_options(options)
-
-        self.initialized = True
-
-cdef _set_col_children_metadata(Column col,
-                                ColumnInMetadata col_meta,
-                                list_column_as_map=False):
-    if isinstance(col.dtype, cudf.StructDtype):
-        for i, (child_col, name) in enumerate(
-            zip(col.children, list(col.dtype.fields))
-        ):
-            col_meta.child(i).set_name(name)
-            _set_col_children_metadata(
-                child_col, col_meta.child(i), list_column_as_map
-            )
-    elif isinstance(col.dtype, cudf.ListDtype):
-        if list_column_as_map:
-            col_meta.set_list_column_as_map()
-        _set_col_children_metadata(
-            col.children[cpp_lists_column_view.child_column_index],
-            col_meta.child(cpp_lists_column_view.child_column_index),
-            list_column_as_map
-        )
-    else:
-        return
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
deleted file mode 100644
index c77c9875342..00000000000
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ /dev/null
@@ -1,843 +0,0 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
-
-import io
-
-import pyarrow as pa
-import itertools
-import cudf
-from cudf.core.buffer import acquire_spill_lock
-
-try:
-    import ujson as json
-except ImportError:
-    import json
-
-import numpy as np
-
-from cudf.api.types import is_list_like
-
-from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io
-
-from cudf._lib.utils import _index_level_name, generate_pandas_metadata
-
-from libc.stdint cimport int64_t, uint8_t
-from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
-
-from pylibcudf.expressions cimport Expression
-from pylibcudf.io.parquet cimport ChunkedParquetReader
-from pylibcudf.libcudf.io.types cimport (
-    statistics_freq,
-    compression_type,
-    dictionary_policy,
-)
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-from cudf._lib.io.utils cimport (
-    add_df_col_struct_names,
-)
-
-import pylibcudf as plc
-
-from pylibcudf cimport Table
-
-from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT
-from pylibcudf.io.types cimport TableInputMetadata, SinkInfo, ColumnInMetadata
-from pylibcudf.io.parquet cimport ParquetChunkedWriter
-from cython.operator cimport dereference
-
-
-cdef class BufferArrayFromVector:
-    cdef Py_ssize_t length
-    cdef unique_ptr[vector[uint8_t]] in_vec
-
-    # these two things declare part of the buffer interface
-    cdef Py_ssize_t shape[1]
-    cdef Py_ssize_t strides[1]
-
-    @staticmethod
-    cdef BufferArrayFromVector from_unique_ptr(
-        unique_ptr[vector[uint8_t]] in_vec
-    ):
-        cdef BufferArrayFromVector buf = BufferArrayFromVector()
-        buf.in_vec = move(in_vec)
-        buf.length = dereference(buf.in_vec).size()
-        return buf
-
-    def __getbuffer__(self, Py_buffer *buffer, int flags):
-        cdef Py_ssize_t itemsize = sizeof(uint8_t)
-
-        self.shape[0] = self.length
-        self.strides[0] = 1
-
-        buffer.buf = dereference(self.in_vec).data()
-
-        buffer.format = NULL  # byte
-        buffer.internal = NULL
-        buffer.itemsize = itemsize
-        buffer.len = self.length * itemsize   # product(shape) * itemsize
-        buffer.ndim = 1
-        buffer.obj = self
-        buffer.readonly = 0
-        buffer.shape = self.shape
-        buffer.strides = self.strides
-        buffer.suboffsets = NULL
-
-    def __releasebuffer__(self, Py_buffer *buffer):
-        pass
-
-
-def _parse_metadata(meta):
-    file_is_range_index = False
-    file_index_cols = None
-    file_column_dtype = None
-
-    if 'index_columns' in meta and len(meta['index_columns']) > 0:
-        file_index_cols = meta['index_columns']
-
-        if isinstance(file_index_cols[0], dict) and \
-                file_index_cols[0]['kind'] == 'range':
-            file_is_range_index = True
-    if 'column_indexes' in meta and len(meta['column_indexes']) == 1:
-        file_column_dtype = meta['column_indexes'][0]["numpy_type"]
-    return file_is_range_index, file_index_cols, file_column_dtype
-
-
-cdef object _process_metadata(object df,
-                              list names,
-                              dict child_names,
-                              list per_file_user_data,
-                              object row_groups,
-                              object filepaths_or_buffers,
-                              bool allow_range_index,
-                              bool use_pandas_metadata,
-                              size_type nrows=-1,
-                              int64_t skip_rows=0,
-                              ):
-
-    add_df_col_struct_names(df, child_names)
-    index_col = None
-    is_range_index = True
-    column_index_type = None
-    index_col_names = None
-    meta = None
-    for single_file in per_file_user_data:
-        if b'pandas' not in single_file:
-            continue
-        json_str = single_file[b'pandas'].decode('utf-8')
-        meta = json.loads(json_str)
-        file_is_range_index, index_col, column_index_type = _parse_metadata(meta)
-        is_range_index &= file_is_range_index
-
-        if not file_is_range_index and index_col is not None \
-                and index_col_names is None:
-            index_col_names = {}
-            for idx_col in index_col:
-                for c in meta['columns']:
-                    if c['field_name'] == idx_col:
-                        index_col_names[idx_col] = c['name']
-
-    if meta is not None:
-        # Book keep each column metadata as the order
-        # of `meta["columns"]` and `column_names` are not
-        # guaranteed to be deterministic and same always.
-        meta_data_per_column = {
-            col_meta['name']: col_meta for col_meta in meta["columns"]
-        }
-
-        # update the decimal precision of each column
-        for col in names:
-            if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype):
-                df._data[col].dtype.precision = (
-                    meta_data_per_column[col]["metadata"]["precision"]
-                )
-
-    # Set the index column
-    if index_col is not None and len(index_col) > 0:
-        if is_range_index:
-            if not allow_range_index:
-                return df
-
-            if len(per_file_user_data) > 1:
-                range_index_meta = {
-                    "kind": "range",
-                    "name": None,
-                    "start": 0,
-                    "stop": len(df),
-                    "step": 1
-                }
-            else:
-                range_index_meta = index_col[0]
-
-            if row_groups is not None:
-                per_file_metadata = [
-                    pa.parquet.read_metadata(
-                        # Pyarrow cannot read directly from bytes
-                        io.BytesIO(s) if isinstance(s, bytes) else s
-                    ) for s in filepaths_or_buffers
-                ]
-
-                filtered_idx = []
-                for i, file_meta in enumerate(per_file_metadata):
-                    row_groups_i = []
-                    start = 0
-                    for row_group in range(file_meta.num_row_groups):
-                        stop = start + file_meta.row_group(row_group).num_rows
-                        row_groups_i.append((start, stop))
-                        start = stop
-
-                    for rg in row_groups[i]:
-                        filtered_idx.append(
-                            cudf.RangeIndex(
-                                start=row_groups_i[rg][0],
-                                stop=row_groups_i[rg][1],
-                                step=range_index_meta['step']
-                            )
-                        )
-
-                if len(filtered_idx) > 0:
-                    idx = cudf.concat(filtered_idx)
-                else:
-                    idx = cudf.Index._from_column(cudf.core.column.column_empty(0))
-            else:
-                start = range_index_meta["start"] + skip_rows
-                stop = range_index_meta["stop"]
-                if nrows != -1:
-                    stop = start + nrows
-                idx = cudf.RangeIndex(
-                    start=start,
-                    stop=stop,
-                    step=range_index_meta['step'],
-                    name=range_index_meta['name']
-                )
-
-            df._index = idx
-        elif set(index_col).issubset(names):
-            index_data = df[index_col]
-            actual_index_names = iter(index_col_names.values())
-            if index_data._num_columns == 1:
-                idx = cudf.Index._from_column(
-                    index_data._columns[0],
-                    name=next(actual_index_names)
-                )
-            else:
-                idx = cudf.MultiIndex.from_frame(
-                    index_data,
-                    names=list(actual_index_names)
-                )
-            df.drop(columns=index_col, inplace=True)
-            df._index = idx
-        else:
-            if use_pandas_metadata:
-                df.index.names = index_col
-
-    if df._num_columns == 0 and column_index_type is not None:
-        df._data.label_dtype = cudf.dtype(column_index_type)
-
-    return df
-
-
-def read_parquet_chunked(
-    filepaths_or_buffers,
-    columns=None,
-    row_groups=None,
-    use_pandas_metadata=True,
-    size_t chunk_read_limit=0,
-    size_t pass_read_limit=1024000000,
-    size_type nrows=-1,
-    int64_t skip_rows=0,
-    allow_mismatched_pq_schemas=False
-):
-    # Note: If this function ever takes accepts filters
-    # allow_range_index needs to be False when a filter is passed
-    # (see read_parquet)
-    allow_range_index = columns is not None and len(columns) != 0
-
-    reader = ChunkedParquetReader(
-        plc.io.SourceInfo(filepaths_or_buffers),
-        columns,
-        row_groups,
-        use_pandas_metadata=use_pandas_metadata,
-        chunk_read_limit=chunk_read_limit,
-        pass_read_limit=pass_read_limit,
-        skip_rows=skip_rows,
-        nrows=nrows,
-        allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
-    )
-
-    tbl_w_meta = reader.read_chunk()
-    column_names = tbl_w_meta.column_names(include_children=False)
-    child_names = tbl_w_meta.child_names
-    per_file_user_data = tbl_w_meta.per_file_user_data
-    concatenated_columns = tbl_w_meta.tbl.columns()
-
-    # save memory
-    del tbl_w_meta
-
-    cdef Table tbl
-    while reader.has_next():
-        tbl = reader.read_chunk().tbl
-
-        for i in range(tbl.num_columns()):
-            concatenated_columns[i] = plc.concatenate.concatenate(
-                [concatenated_columns[i], tbl._columns[i]]
-            )
-            # Drop residual columns to save memory
-            tbl._columns[i] = None
-
-    df = cudf.DataFrame._from_data(
-        *_data_from_columns(
-            columns=[Column.from_pylibcudf(plc) for plc in concatenated_columns],
-            column_names=column_names,
-            index_names=None
-        )
-    )
-    df = _process_metadata(df, column_names, child_names,
-                           per_file_user_data, row_groups,
-                           filepaths_or_buffers,
-                           allow_range_index, use_pandas_metadata,
-                           nrows=nrows, skip_rows=skip_rows)
-    return df
-
-
-cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
-                   use_pandas_metadata=True,
-                   Expression filters=None,
-                   size_type nrows=-1,
-                   int64_t skip_rows=0,
-                   allow_mismatched_pq_schemas=False):
-    """
-    Cython function to call into libcudf API, see `read_parquet`.
-
-    filters, if not None, should be an Expression that evaluates to a
-    boolean predicate as a function of columns being read.
-
-    See Also
-    --------
-    cudf.io.parquet.read_parquet
-    cudf.io.parquet.to_parquet
-    """
-
-    allow_range_index = True
-    if columns is not None and len(columns) == 0 or filters:
-        allow_range_index = False
-
-    # Read Parquet
-
-    tbl_w_meta = plc.io.parquet.read_parquet(
-        plc.io.SourceInfo(filepaths_or_buffers),
-        columns,
-        row_groups,
-        filters,
-        convert_strings_to_categories = False,
-        use_pandas_metadata = use_pandas_metadata,
-        skip_rows = skip_rows,
-        nrows = nrows,
-        allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
-    )
-
-    df = cudf.DataFrame._from_data(
-        *data_from_pylibcudf_io(tbl_w_meta)
-    )
-
-    df = _process_metadata(df, tbl_w_meta.column_names(include_children=False),
-                           tbl_w_meta.child_names, tbl_w_meta.per_file_user_data,
-                           row_groups, filepaths_or_buffers,
-                           allow_range_index, use_pandas_metadata,
-                           nrows=nrows, skip_rows=skip_rows)
-    return df
-
-cpdef read_parquet_metadata(list filepaths_or_buffers):
-    """
-    Cython function to call into libcudf API, see `read_parquet_metadata`.
-
-    See Also
-    --------
-    cudf.io.parquet.read_parquet
-    cudf.io.parquet.to_parquet
-    """
-    parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata(
-        plc.io.SourceInfo(filepaths_or_buffers)
-    )
-
-    # read all column names including index column, if any
-    col_names = [info.name() for info in parquet_metadata.schema().root().children()]
-
-    index_col_names = set()
-    json_str = parquet_metadata.metadata()['pandas']
-    if json_str != "":
-        meta = json.loads(json_str)
-        file_is_range_index, index_col, _ = _parse_metadata(meta)
-        if (
-            not file_is_range_index
-            and index_col is not None
-        ):
-            columns = meta['columns']
-            for idx_col in index_col:
-                for c in columns:
-                    if c['field_name'] == idx_col:
-                        index_col_names.add(idx_col)
-
-    # remove the index column from the list of column names
-    # only if index_col_names is not None
-    if len(index_col_names) >= 0:
-        col_names = [name for name in col_names if name not in index_col_names]
-
-    return (
-        parquet_metadata.num_rows(),
-        parquet_metadata.num_rowgroups(),
-        col_names,
-        len(col_names),
-        parquet_metadata.rowgroup_metadata()
-    )
-
-
-@acquire_spill_lock()
-def write_parquet(
-    table,
-    object filepaths_or_buffers,
-    object index=None,
-    object compression="snappy",
-    object statistics="ROWGROUP",
-    object metadata_file_path=None,
-    object int96_timestamps=False,
-    object row_group_size_bytes=None,
-    object row_group_size_rows=None,
-    object max_page_size_bytes=None,
-    object max_page_size_rows=None,
-    object max_dictionary_size=None,
-    object partitions_info=None,
-    object force_nullable_schema=False,
-    header_version="1.0",
-    use_dictionary=True,
-    object skip_compression=None,
-    object column_encoding=None,
-    object column_type_length=None,
-    object output_as_binary=None,
-    write_arrow_schema=False,
-):
-    """
-    Cython function to call into libcudf API, see `write_parquet`.
-
-    See Also
-    --------
-    cudf.io.parquet.write_parquet
-    """
-    if index is True or (
-        index is None and not isinstance(table._index, cudf.RangeIndex)
-    ):
-        columns = [*table.index._columns, *table._columns]
-        plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns])
-        tbl_meta = TableInputMetadata(plc_table)
-        for level, idx_name in enumerate(table._index.names):
-            tbl_meta.column_metadata[level].set_name(
-                _index_level_name(idx_name, level, table._column_names)
-            )
-        num_index_cols_meta = len(table._index.names)
-    else:
-        plc_table = plc.Table(
-            [col.to_pylibcudf(mode="read") for col in table._columns]
-        )
-        tbl_meta = TableInputMetadata(plc_table)
-        num_index_cols_meta = 0
-
-    for i, name in enumerate(table._column_names, num_index_cols_meta):
-        if not isinstance(name, str):
-            if cudf.get_option("mode.pandas_compatible"):
-                tbl_meta.column_metadata[i].set_name(str(name))
-            else:
-                raise ValueError(
-                    "Writing a Parquet file requires string column names"
-                )
-        else:
-            tbl_meta.column_metadata[i].set_name(name)
-
-        _set_col_metadata(
-            table[name]._column,
-            tbl_meta.column_metadata[i],
-            force_nullable_schema,
-            None,
-            skip_compression,
-            column_encoding,
-            column_type_length,
-            output_as_binary
-        )
-    if partitions_info is not None:
-        user_data = [
-            {"pandas": generate_pandas_metadata(
-                table.iloc[start_row:start_row + num_row].copy(deep=False),
-                index
-            )}
-            for start_row, num_row in partitions_info
-        ]
-    else:
-        user_data = [{"pandas": generate_pandas_metadata(table, index)}]
-
-    if header_version not in ("1.0", "2.0"):
-        raise ValueError(
-            f"Invalid parquet header version: {header_version}. "
-            "Valid values are '1.0' and '2.0'"
-        )
-
-    dict_policy = (
-        plc.io.types.DictionaryPolicy.ADAPTIVE
-        if use_dictionary
-        else plc.io.types.DictionaryPolicy.NEVER
-    )
-
-    comp_type = _get_comp_type(compression)
-    stat_freq = _get_stat_freq(statistics)
-    options = (
-        plc.io.parquet.ParquetWriterOptions.builder(
-            plc.io.SinkInfo(filepaths_or_buffers), plc_table
-        )
-        .metadata(tbl_meta)
-        .key_value_metadata(user_data)
-        .compression(comp_type)
-        .stats_level(stat_freq)
-        .int96_timestamps(int96_timestamps)
-        .write_v2_headers(header_version == "2.0")
-        .dictionary_policy(dict_policy)
-        .utc_timestamps(False)
-        .write_arrow_schema(write_arrow_schema)
-        .build()
-    )
-    if partitions_info is not None:
-        options.set_partitions(
-            [plc.io.types.PartitionInfo(part[0], part[1]) for part in partitions_info]
-        )
-    if metadata_file_path is not None:
-        if is_list_like(metadata_file_path):
-            options.set_column_chunks_file_paths(metadata_file_path)
-        else:
-            options.set_column_chunks_file_paths([metadata_file_path])
-    if row_group_size_bytes is not None:
-        options.set_row_group_size_bytes(row_group_size_bytes)
-    if row_group_size_rows is not None:
-        options.set_row_group_size_rows(row_group_size_rows)
-    if max_page_size_bytes is not None:
-        options.set_max_page_size_bytes(max_page_size_bytes)
-    if max_page_size_rows is not None:
-        options.set_max_page_size_rows(max_page_size_rows)
-    if max_dictionary_size is not None:
-        options.set_max_dictionary_size(max_dictionary_size)
-    blob = plc.io.parquet.write_parquet(options)
-    if metadata_file_path is not None:
-        return np.asarray(blob.obj)
-    else:
-        return None
-
-
-cdef class ParquetWriter:
-    """
-    ParquetWriter lets you incrementally write out a Parquet file from a series
-    of cudf tables
-
-    Parameters
-    ----------
-    filepath_or_buffer : str, io.IOBase, os.PathLike, or list
-        File path or buffer to write to. The argument may also correspond
-        to a list of file paths or buffers.
-    index : bool or None, default None
-        If ``True``, include a dataframe's index(es) in the file output.
-        If ``False``, they will not be written to the file. If ``None``,
-        index(es) other than RangeIndex will be saved as columns.
-    compression : {'snappy', None}, default 'snappy'
-        Name of the compression to use. Use ``None`` for no compression.
-    statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
-        Level at which column statistics should be included in file.
-    row_group_size_bytes: int, default ``uint64 max``
-        Maximum size of each stripe of the output.
-        By default, a virtually infinite size equal to ``uint64 max`` will be used.
-    row_group_size_rows: int, default 1000000
-        Maximum number of rows of each stripe of the output.
-        By default, 1000000 (10^6 rows) will be used.
-    max_page_size_bytes: int, default 524288
-        Maximum uncompressed size of each page of the output.
-        By default, 524288 (512KB) will be used.
-    max_page_size_rows: int, default 20000
-        Maximum number of rows of each page of the output.
-        By default, 20000 will be used.
-    max_dictionary_size: int, default 1048576
-        Maximum size of the dictionary page for each output column chunk. Dictionary
-        encoding for column chunks that exceeds this limit will be disabled.
-        By default, 1048576 (1MB) will be used.
-    use_dictionary : bool, default True
-        If ``True``, enable dictionary encoding for Parquet page data
-        subject to ``max_dictionary_size`` constraints.
-        If ``False``, disable dictionary encoding for Parquet page data.
-    store_schema : bool, default False
-        If ``True``, enable computing and writing arrow schema to Parquet
-        file footer's key-value metadata section for faithful round-tripping.
-    See Also
-    --------
-    cudf.io.parquet.write_parquet
-    """
-    cdef bool initialized
-    cdef ParquetChunkedWriter writer
-    cdef SinkInfo sink
-    cdef TableInputMetadata tbl_meta
-    cdef str statistics
-    cdef object compression
-    cdef object index
-    cdef size_t row_group_size_bytes
-    cdef size_type row_group_size_rows
-    cdef size_t max_page_size_bytes
-    cdef size_type max_page_size_rows
-    cdef size_t max_dictionary_size
-    cdef bool use_dictionary
-    cdef bool write_arrow_schema
-
-    def __cinit__(self, object filepath_or_buffer, object index=None,
-                  object compression="snappy", str statistics="ROWGROUP",
-                  size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
-                  size_type row_group_size_rows=1000000,
-                  size_t max_page_size_bytes=524288,
-                  size_type max_page_size_rows=20000,
-                  size_t max_dictionary_size=1048576,
-                  bool use_dictionary=True,
-                  bool store_schema=False):
-        filepaths_or_buffers = (
-            list(filepath_or_buffer)
-            if is_list_like(filepath_or_buffer)
-            else [filepath_or_buffer]
-        )
-        self.sink = plc.io.SinkInfo(filepaths_or_buffers)
-        self.statistics = statistics
-        self.compression = compression
-        self.index = index
-        self.initialized = False
-        self.row_group_size_bytes = row_group_size_bytes
-        self.row_group_size_rows = row_group_size_rows
-        self.max_page_size_bytes = max_page_size_bytes
-        self.max_page_size_rows = max_page_size_rows
-        self.max_dictionary_size = max_dictionary_size
-        self.use_dictionary = use_dictionary
-        self.write_arrow_schema = store_schema
-
-    def write_table(self, table, object partitions_info=None):
-        """ Writes a single table to the file """
-        if not self.initialized:
-            self._initialize_chunked_state(
-                table,
-                num_partitions=len(partitions_info) if partitions_info else 1
-            )
-        if self.index is not False and (
-            table._index.name is not None or
-                isinstance(table._index, cudf.core.multiindex.MultiIndex)):
-            columns = [*table.index._columns, *table._columns]
-            plc_table = plc.Table([col.to_pylibcudf(mode="read") for col in columns])
-        else:
-            plc_table = plc.Table(
-                [col.to_pylibcudf(mode="read") for col in table._columns]
-            )
-        self.writer.write(plc_table, partitions_info)
-
-    def close(self, object metadata_file_path=None):
-        if not self.initialized:
-            return None
-        column_chunks_file_paths=[]
-        if metadata_file_path is not None:
-            if is_list_like(metadata_file_path):
-                column_chunks_file_paths = list(metadata_file_path)
-            else:
-                column_chunks_file_paths = [metadata_file_path]
-        blob = self.writer.close(column_chunks_file_paths)
-        if metadata_file_path is not None:
-            return np.asarray(blob.obj)
-        return None
-
-    def __enter__(self):
-        return self
-
-    def __exit__(self, *args):
-        self.close()
-
-    def _initialize_chunked_state(self, table, num_partitions=1):
-        """ Prepares all the values required to build the
-        chunked_parquet_writer_options and creates a writer"""
-
-        # Set the table_metadata
-        num_index_cols_meta = 0
-        plc_table = plc.Table(
-            [
-                col.to_pylibcudf(mode="read")
-                for col in table._columns
-            ]
-        )
-        self.tbl_meta = TableInputMetadata(plc_table)
-        if self.index is not False:
-            if isinstance(table._index, cudf.core.multiindex.MultiIndex):
-                plc_table = plc.Table(
-                    [
-                        col.to_pylibcudf(mode="read")
-                        for col in itertools.chain(table.index._columns, table._columns)
-                    ]
-                )
-                self.tbl_meta = TableInputMetadata(plc_table)
-                for level, idx_name in enumerate(table._index.names):
-                    self.tbl_meta.column_metadata[level].set_name(idx_name)
-                num_index_cols_meta = len(table._index.names)
-            else:
-                if table._index.name is not None:
-                    plc_table = plc.Table(
-                        [
-                            col.to_pylibcudf(mode="read")
-                            for col in itertools.chain(
-                                table.index._columns, table._columns
-                            )
-                        ]
-                    )
-                    self.tbl_meta = TableInputMetadata(plc_table)
-                    self.tbl_meta.column_metadata[0].set_name(table._index.name)
-                    num_index_cols_meta = 1
-
-        for i, name in enumerate(table._column_names, num_index_cols_meta):
-            self.tbl_meta.column_metadata[i].set_name(name)
-            _set_col_metadata(
-                table[name]._column,
-                self.tbl_meta.column_metadata[i],
-            )
-
-        index = (
-            False if isinstance(table._index, cudf.RangeIndex) else self.index
-        )
-        user_data = [{"pandas" : generate_pandas_metadata(table, index)}]*num_partitions
-        cdef compression_type comp_type = _get_comp_type(self.compression)
-        cdef statistics_freq stat_freq = _get_stat_freq(self.statistics)
-        cdef dictionary_policy dict_policy = (
-            plc.io.types.DictionaryPolicy.ADAPTIVE
-            if self.use_dictionary
-            else plc.io.types.DictionaryPolicy.NEVER
-        )
-        options = (
-            plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink)
-            .metadata(self.tbl_meta)
-            .key_value_metadata(user_data)
-            .compression(comp_type)
-            .stats_level(stat_freq)
-            .row_group_size_bytes(self.row_group_size_bytes)
-            .row_group_size_rows(self.row_group_size_rows)
-            .max_page_size_bytes(self.max_page_size_bytes)
-            .max_page_size_rows(self.max_page_size_rows)
-            .max_dictionary_size(self.max_dictionary_size)
-            .write_arrow_schema(self.write_arrow_schema)
-            .build()
-        )
-        options.set_dictionary_policy(dict_policy)
-        self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options)
-        self.initialized = True
-
-
-cpdef merge_filemetadata(object filemetadata_list):
-    """
-    Cython function to call into libcudf API, see `merge_row_group_metadata`.
-
-    See Also
-    --------
-    cudf.io.parquet.merge_row_group_metadata
-    """
-    return np.asarray(
-        plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj
-    )
-
-
-cdef statistics_freq _get_stat_freq(str statistics):
-    result = getattr(
-        plc.io.types.StatisticsFreq,
-        f"STATISTICS_{statistics.upper()}",
-        None
-    )
-    if result is None:
-        raise ValueError("Unsupported `statistics_freq` type")
-    return result
-
-
-cdef compression_type _get_comp_type(object compression):
-    if compression is None:
-        return plc.io.types.CompressionType.NONE
-    result = getattr(
-        plc.io.types.CompressionType,
-        str(compression).upper(),
-        None
-    )
-    if result is None:
-        raise ValueError("Unsupported `compression` type")
-    return result
-
-
-cdef _set_col_metadata(
-    Column col,
-    ColumnInMetadata col_meta,
-    bool force_nullable_schema=False,
-    str path=None,
-    object skip_compression=None,
-    object column_encoding=None,
-    object column_type_length=None,
-    object output_as_binary=None,
-):
-    need_path = (skip_compression is not None or column_encoding is not None or
-                 column_type_length is not None or output_as_binary is not None)
-    name = col_meta.get_name() if need_path else None
-    full_path = path + "." + name if path is not None else name
-
-    if force_nullable_schema:
-        # Only set nullability if `force_nullable_schema`
-        # is true.
-        col_meta.set_nullability(True)
-
-    if skip_compression is not None and full_path in skip_compression:
-        col_meta.set_skip_compression(True)
-
-    if column_encoding is not None and full_path in column_encoding:
-        encoding = column_encoding[full_path]
-        if encoding is None:
-            c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT
-        else:
-            enc = str(encoding).upper()
-            c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None)
-            if c_encoding is None:
-                raise ValueError("Unsupported `column_encoding` type")
-        col_meta.set_encoding(c_encoding)
-
-    if column_type_length is not None and full_path in column_type_length:
-        col_meta.set_output_as_binary(True)
-        col_meta.set_type_length(column_type_length[full_path])
-
-    if output_as_binary is not None and full_path in output_as_binary:
-        col_meta.set_output_as_binary(True)
-
-    if isinstance(col.dtype, cudf.StructDtype):
-        for i, (child_col, name) in enumerate(
-            zip(col.children, list(col.dtype.fields))
-        ):
-            col_meta.child(i).set_name(name)
-            _set_col_metadata(
-                child_col,
-                col_meta.child(i),
-                force_nullable_schema,
-                full_path,
-                skip_compression,
-                column_encoding,
-                column_type_length,
-                output_as_binary
-            )
-    elif isinstance(col.dtype, cudf.ListDtype):
-        if full_path is not None:
-            full_path = full_path + ".list"
-            col_meta.child(1).set_name("element")
-        _set_col_metadata(
-            col.children[1],
-            col_meta.child(1),
-            force_nullable_schema,
-            full_path,
-            skip_compression,
-            column_encoding,
-            column_type_length,
-            output_as_binary
-        )
-    elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype):
-        col_meta.set_decimal_precision(col.dtype.precision)
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
deleted file mode 100644
index 2850cab93a1..00000000000
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-import warnings
-
-import cudf
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id
-
-import pylibcudf
-
-from cudf.core._internals.aggregation import make_aggregation
-
-
-@acquire_spill_lock()
-def reduce(reduction_op, Column incol, dtype=None, **kwargs):
-    """
-    Top level Cython reduce function wrapping libcudf reductions.
-
-    Parameters
-    ----------
-    reduction_op : string
-        A string specifying the operation, e.g. sum, prod
-    incol : Column
-        A cuDF Column object
-    dtype: numpy.dtype, optional
-        A numpy data type to use for the output, defaults
-        to the same type as the input column
-    """
-    if dtype is not None:
-        warnings.warn(
-            "dtype is deprecated and will be remove in a future release. "
-            "Cast the result (e.g. .astype) after the operation instead.",
-            FutureWarning
-        )
-        col_dtype = dtype
-    else:
-        col_dtype = incol._reduction_result_dtype(reduction_op)
-
-    # check empty case
-    if len(incol) <= incol.null_count:
-        if reduction_op == 'sum' or reduction_op == 'sum_of_squares':
-            return incol.dtype.type(0)
-        if reduction_op == 'product':
-            return incol.dtype.type(1)
-        if reduction_op == "any":
-            return False
-
-        return cudf.utils.dtypes._get_nan_for_dtype(col_dtype)
-
-    result = pylibcudf.reduce.reduce(
-        incol.to_pylibcudf(mode="read"),
-        make_aggregation(reduction_op, kwargs).c_obj,
-        dtype_to_pylibcudf_type(col_dtype),
-    )
-
-    if is_decimal_type_id(result.type().id()):
-        scale = -result.type().scale()
-        precision = _reduce_precision(col_dtype, reduction_op, len(incol))
-        return DeviceScalar.from_pylibcudf(
-            result,
-            dtype=col_dtype.__class__(precision, scale),
-        ).value
-    scalar = DeviceScalar.from_pylibcudf(result).value
-    if isinstance(col_dtype, cudf.StructDtype):
-        # TODO: Utilize column_metadata in libcudf to maintain field labels
-        return dict(zip(col_dtype.fields.keys(), scalar.values()))
-    return scalar
-
-
-@acquire_spill_lock()
-def scan(scan_op, Column incol, inclusive, **kwargs):
-    """
-    Top level Cython scan function wrapping libcudf scans.
-
-    Parameters
-    ----------
-    incol : Column
-        A cuDF Column object
-    scan_op : string
-        A string specifying the operation, e.g. cumprod
-    inclusive: bool
-        Flag for including nulls in relevant scan
-    """
-    return Column.from_pylibcudf(
-        pylibcudf.reduce.scan(
-            incol.to_pylibcudf(mode="read"),
-            make_aggregation(scan_op, kwargs).c_obj,
-            pylibcudf.reduce.ScanType.INCLUSIVE if inclusive
-            else pylibcudf.reduce.ScanType.EXCLUSIVE,
-        )
-    )
-
-
-@acquire_spill_lock()
-def minmax(Column incol):
-    """
-    Top level Cython minmax function wrapping libcudf minmax.
-
-    Parameters
-    ----------
-    incol : Column
-        A cuDF Column object
-
-    Returns
-    -------
-    A pair of ``(min, max)`` values of ``incol``
-    """
-    min, max = pylibcudf.reduce.minmax(incol.to_pylibcudf(mode="read"))
-    return (
-        cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(min)),
-        cudf.Scalar.from_device_scalar(DeviceScalar.from_pylibcudf(max)),
-    )
-
-
-def _reduce_precision(dtype, op, nrows):
-    """
-    Returns the result precision when performing the reduce
-    operation `op` for the given dtype and column size.
-
-    See: https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql
-    """  # noqa: E501
-    p = dtype.precision
-    if op in ("min", "max"):
-        new_p = p
-    elif op == "sum":
-        new_p = p + nrows - 1
-    elif op == "product":
-        new_p = p * nrows + nrows - 1
-    elif op == "sum_of_squares":
-        new_p = 2 * p + nrows
-    else:
-        raise NotImplementedError()
-    return max(min(new_p, dtype.MAX_PRECISION), 0)
diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx
deleted file mode 100644
index f961c09e6f6..00000000000
--- a/python/cudf/cudf/_lib/round.pyx
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-from pylibcudf.round import RoundingMethod
-
-
-@acquire_spill_lock()
-def round(Column input_col, int decimal_places=0, how="half_even"):
-    """
-    Round column values to the given number of decimal places
-
-    Parameters
-    ----------
-    input_col : Column whose values will be rounded
-    decimal_places : The number or decimal places to round to
-
-    Returns
-    -------
-    A Column with values rounded to the given number of decimal places
-    """
-    if how not in {"half_even", "half_up"}:
-        raise ValueError("'how' must be either 'half_even' or 'half_up'")
-
-    how = (
-        RoundingMethod.HALF_EVEN if how == "half_even"
-        else RoundingMethod.HALF_UP
-    )
-
-    return Column.from_pylibcudf(
-        plc.round.round(
-            input_col.to_pylibcudf(mode="read"),
-            decimal_places,
-            how
-        )
-    )
diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd
index 0f9820ed1db..a3a8a14e70f 100644
--- a/python/cudf/cudf/_lib/scalar.pxd
+++ b/python/cudf/cudf/_lib/scalar.pxd
@@ -17,9 +17,6 @@ cdef class DeviceScalar:
     @staticmethod
     cdef DeviceScalar from_unique_ptr(unique_ptr[scalar] ptr, dtype=*)
 
-    @staticmethod
-    cdef DeviceScalar from_pylibcudf(pscalar, dtype=*)
-
     cdef void _set_dtype(self, dtype=*)
 
     cpdef bool is_valid(DeviceScalar s)
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 56712402919..fd6d0257940 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -10,24 +10,22 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-import pylibcudf
+import pylibcudf as plc
 
 import cudf
-from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES
 from cudf.core.dtypes import ListDtype, StructDtype
+from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES
+from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
 from cudf.core.missing import NA, NaT
 
-cimport pylibcudf.libcudf.types as libcudf_types
 # We currently need this cimport because some of the implementations here
 # access the c_obj of the scalar, and because we need to be able to call
 # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until
 # DeviceScalar is phased out entirely from cuDF Cython (at which point
 # cudf.Scalar will be directly backed by pylibcudf.Scalar).
-from pylibcudf cimport Scalar as plc_Scalar
+from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID
 from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar
 
-from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
-
 
 def _replace_nested(obj, check, replacement):
     if isinstance(obj, list):
@@ -62,12 +60,12 @@ def gather_metadata(dtypes):
     """
     out = []
     for name, dtype in dtypes.items():
-        v = pylibcudf.interop.ColumnMetadata(name)
+        v = plc.interop.ColumnMetadata(name)
         if isinstance(dtype, cudf.StructDtype):
             v.children_meta = gather_metadata(dtype.fields)
         elif isinstance(dtype, cudf.ListDtype):
             # Offsets column is unnamed and has no children
-            v.children_meta.append(pylibcudf.interop.ColumnMetadata(""))
+            v.children_meta.append(plc.interop.ColumnMetadata(""))
             v.children_meta.extend(
                 gather_metadata({"": dtype.element_type})
             )
@@ -81,7 +79,7 @@ cdef class DeviceScalar:
     # that from_unique_ptr is implemented is probably dereferencing this in an
     # invalid state. See what the best way to fix that is.
     def __cinit__(self, *args, **kwargs):
-        self.c_value = pylibcudf.Scalar.__new__(pylibcudf.Scalar)
+        self.c_value = plc.Scalar.__new__(plc.Scalar)
 
     def __init__(self, value, dtype):
         """
@@ -127,20 +125,20 @@ cdef class DeviceScalar:
             pa_array = pa.array([pa.scalar(value, type=pa_type)])
 
         pa_table = pa.Table.from_arrays([pa_array], names=[""])
-        table = pylibcudf.interop.from_arrow(pa_table)
+        table = plc.interop.from_arrow(pa_table)
 
         column = table.columns()[0]
         if isinstance(dtype, cudf.core.dtypes.DecimalDtype):
             if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype):
-                column = pylibcudf.unary.cast(
-                    column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL32, -dtype.scale)
+                column = plc.unary.cast(
+                    column, plc.DataType(plc.TypeId.DECIMAL32, -dtype.scale)
                 )
             elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype):
-                column = pylibcudf.unary.cast(
-                    column, pylibcudf.DataType(pylibcudf.TypeId.DECIMAL64, -dtype.scale)
+                column = plc.unary.cast(
+                    column, plc.DataType(plc.TypeId.DECIMAL64, -dtype.scale)
                 )
 
-        self.c_value = pylibcudf.copying.get_element(column, 0)
+        self.c_value = plc.copying.get_element(column, 0)
         self._dtype = dtype
 
     def _to_host_scalar(self):
@@ -150,7 +148,7 @@ cdef class DeviceScalar:
         null_type = NaT if is_datetime or is_timedelta else NA
 
         metadata = gather_metadata({"": self.dtype})[0]
-        ps = pylibcudf.interop.to_arrow(self.c_value, metadata)
+        ps = plc.interop.to_arrow(self.c_value, metadata)
         if not ps.is_valid:
             return null_type
 
@@ -218,41 +216,40 @@ cdef class DeviceScalar:
         return s
 
     @staticmethod
-    cdef DeviceScalar from_pylibcudf(pscalar, dtype=None):
+    def from_pylibcudf(pscalar, dtype=None):
         cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar)
         s.c_value = pscalar
         s._set_dtype(dtype)
         return s
 
     cdef void _set_dtype(self, dtype=None):
-        cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type()
-
+        cdef plc_TypeID cdtype_id = self.c_value.type().id()
         if dtype is not None:
             self._dtype = dtype
-        elif cdtype.id() in {
-            libcudf_types.type_id.DECIMAL32,
-            libcudf_types.type_id.DECIMAL64,
-            libcudf_types.type_id.DECIMAL128,
+        elif cdtype_id in {
+            plc_TypeID.DECIMAL32,
+            plc_TypeID.DECIMAL64,
+            plc_TypeID.DECIMAL128,
         }:
             raise TypeError(
                 "Must pass a dtype when constructing from a fixed-point scalar"
             )
-        elif cdtype.id() == libcudf_types.type_id.STRUCT:
+        elif cdtype_id == plc_TypeID.STRUCT:
             struct_table_view = (<struct_scalar*>self.get_raw_ptr())[0].view()
             self._dtype = StructDtype({
                 str(i): dtype_from_column_view(struct_table_view.column(i))
                 for i in range(struct_table_view.num_columns())
             })
-        elif cdtype.id() == libcudf_types.type_id.LIST:
+        elif cdtype_id == plc_TypeID.LIST:
             if (
                 <list_scalar*>self.get_raw_ptr()
-            )[0].view().type().id() == libcudf_types.type_id.LIST:
+            )[0].view().type().id() == plc_TypeID.LIST:
                 self._dtype = dtype_from_column_view(
                     (<list_scalar*>self.get_raw_ptr())[0].view()
                 )
             else:
                 self._dtype = ListDtype(
-                    LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
+                    PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
                         <underlying_type_t_type_id>(
                             (<list_scalar*>self.get_raw_ptr())[0]
                             .view().type().id()
@@ -260,29 +257,6 @@ cdef class DeviceScalar:
                     ]
                 )
         else:
-            self._dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
-                <underlying_type_t_type_id>(cdtype.id())
+            self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
+                <underlying_type_t_type_id>(cdtype_id)
             ]
-
-
-def as_device_scalar(val, dtype=None):
-    if isinstance(val, (cudf.Scalar, DeviceScalar)):
-        if dtype == val.dtype or dtype is None:
-            if isinstance(val, DeviceScalar):
-                return val
-            else:
-                return val.device_value
-        else:
-            raise TypeError("Can't update dtype of existing GPU scalar")
-    else:
-        return cudf.Scalar(val, dtype=dtype).device_value
-
-
-def _is_null_host_scalar(slr):
-    if cudf.utils.utils.is_na_like(slr):
-        return True
-    elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \
-            slr is pd.NaT:
-        return True
-    else:
-        return False
diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx
deleted file mode 100644
index eefe37d9880..00000000000
--- a/python/cudf/cudf/_lib/sort.pyx
+++ /dev/null
@@ -1,365 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from itertools import repeat
-
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp cimport bool
-
-from pylibcudf.libcudf.aggregation cimport rank_method
-from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_pylibcudf_table
-
-import pylibcudf
-
-
-@acquire_spill_lock()
-def is_sorted(
-    list source_columns, object ascending=None, object null_position=None
-):
-    """
-    Checks whether the rows of a `table` are sorted in lexicographical order.
-
-    Parameters
-    ----------
-    source_columns : list of columns
-        columns to be checked for sort order
-    ascending : None or list-like of booleans
-        None or list-like of boolean values indicating expected sort order of
-        each column. If list-like, size of list-like must be len(columns). If
-        None, all columns expected sort order is set to ascending. False (0) -
-        descending, True (1) - ascending.
-    null_position : None or list-like of booleans
-        None or list-like of boolean values indicating desired order of nulls
-        compared to other elements. If list-like, size of list-like must be
-        len(columns). If None, null order is set to before. False (0) - after,
-        True (1) - before.
-
-    Returns
-    -------
-    returns : boolean
-        Returns True, if sorted as expected by ``ascending`` and
-        ``null_position``, False otherwise.
-    """
-
-    if ascending is None:
-        column_order = [pylibcudf.types.Order.ASCENDING] * len(source_columns)
-    else:
-        if len(ascending) != len(source_columns):
-            raise ValueError(
-                f"Expected a list-like of length {len(source_columns)}, "
-                f"got length {len(ascending)} for `ascending`"
-            )
-        column_order = [pylibcudf.types.Order.DESCENDING] * len(source_columns)
-        for idx, val in enumerate(ascending):
-            if val:
-                column_order[idx] = pylibcudf.types.Order.ASCENDING
-
-    if null_position is None:
-        null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns)
-    else:
-        if len(null_position) != len(source_columns):
-            raise ValueError(
-                f"Expected a list-like of length {len(source_columns)}, "
-                f"got length {len(null_position)} for `null_position`"
-            )
-        null_precedence = [pylibcudf.types.NullOrder.AFTER] * len(source_columns)
-        for idx, val in enumerate(null_position):
-            if val:
-                null_precedence[idx] = pylibcudf.types.NullOrder.BEFORE
-
-    return pylibcudf.sorting.is_sorted(
-        pylibcudf.Table(
-            [c.to_pylibcudf(mode="read") for c in source_columns]
-        ),
-        column_order,
-        null_precedence
-    )
-
-
-def ordering(column_order, null_precedence):
-    """
-    Construct order and null order vectors
-
-    Parameters
-    ----------
-    column_order
-        Iterable of bool (True for ascending order, False for descending)
-    null_precedence
-        Iterable string for null positions ("first" for start, "last" for end)
-
-    Both iterables must be the same length (not checked)
-
-    Returns
-    -------
-    pair of vectors (order, and null_order)
-    """
-    c_column_order = []
-    c_null_precedence = []
-    for asc, null in zip(column_order, null_precedence):
-        c_column_order.append(
-            pylibcudf.types.Order.ASCENDING if asc else pylibcudf.types.Order.DESCENDING
-        )
-        if asc ^ (null == "first"):
-            c_null_precedence.append(pylibcudf.types.NullOrder.AFTER)
-        elif asc ^ (null == "last"):
-            c_null_precedence.append(pylibcudf.types.NullOrder.BEFORE)
-        else:
-            raise ValueError(f"Invalid null precedence {null}")
-    return c_column_order, c_null_precedence
-
-
-@acquire_spill_lock()
-def order_by(
-    list columns_from_table,
-    object ascending,
-    str na_position,
-    *,
-    bool stable
-):
-    """
-    Get index to sort the table in ascending/descending order.
-
-    Parameters
-    ----------
-    columns_from_table : list[Column]
-        Columns from the table which will be sorted
-    ascending : sequence[bool]
-         Sequence of boolean values which correspond to each column
-         in the table to be sorted signifying the order of each column
-         True - Ascending and False - Descending
-    na_position : str
-        Whether null values should show up at the "first" or "last"
-        position of **all** sorted column.
-    stable : bool
-        Should the sort be stable? (no default)
-
-    Returns
-    -------
-    Column of indices that sorts the table
-    """
-    order = ordering(ascending, repeat(na_position))
-    func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sorted_order")
-
-    return Column.from_pylibcudf(
-        func(
-            pylibcudf.Table(
-                [c.to_pylibcudf(mode="read") for c in columns_from_table],
-            ),
-            order[0],
-            order[1],
-        )
-    )
-
-
-@acquire_spill_lock()
-def sort(
-    list values,
-    list column_order=None,
-    list null_precedence=None,
-):
-    """
-    Sort the table in ascending/descending order.
-
-    Parameters
-    ----------
-    values : list[Column]
-        Columns of the table which will be sorted
-    column_order : list[bool], optional
-        Sequence of boolean values which correspond to each column in
-        keys providing the sort order (default all True).
-        With True <=> ascending; False <=> descending.
-    null_precedence : list[str], optional
-        Sequence of "first" or "last" values (default "first")
-        indicating the position of null values when sorting the keys.
-    """
-    ncol = len(values)
-    order = ordering(
-        column_order or repeat(True, ncol),
-        null_precedence or repeat("first", ncol),
-    )
-    return columns_from_pylibcudf_table(
-        pylibcudf.sorting.sort(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]),
-            order[0],
-            order[1],
-        )
-    )
-
-
-@acquire_spill_lock()
-def sort_by_key(
-    list values,
-    list keys,
-    object ascending,
-    object na_position,
-    *,
-    bool stable,
-):
-    """
-    Sort a table by given keys
-
-    Parameters
-    ----------
-    values : list[Column]
-        Columns of the table which will be sorted
-    keys : list[Column]
-        Columns making up the sort key
-    ascending : list[bool]
-        Sequence of boolean values which correspond to each column
-        in the table to be sorted signifying the order of each column
-        True - Ascending and False - Descending
-    na_position : list[str]
-        Sequence of "first" or "last" values (default "first")
-        indicating the position of null values when sorting the keys.
-    stable : bool
-        Should the sort be stable? (no default)
-
-    Returns
-    -------
-    list[Column]
-        list of value columns sorted by keys
-    """
-    order = ordering(ascending, na_position)
-    func = getattr(pylibcudf.sorting, f"{'stable_' if stable else ''}sort_by_key")
-    return columns_from_pylibcudf_table(
-        func(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]),
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]),
-            order[0],
-            order[1],
-        )
-    )
-
-
-@acquire_spill_lock()
-def segmented_sort_by_key(
-    list values,
-    list keys,
-    Column segment_offsets,
-    list column_order=None,
-    list null_precedence=None,
-    *,
-    bool stable,
-):
-    """
-    Sort segments of a table by given keys
-
-    Parameters
-    ----------
-    values : list[Column]
-        Columns of the table which will be sorted
-    keys : list[Column]
-        Columns making up the sort key
-    offsets : Column
-        Segment offsets
-    column_order : list[bool], optional
-        Sequence of boolean values which correspond to each column in
-        keys providing the sort order (default all True).
-        With True <=> ascending; False <=> descending.
-    null_precedence : list[str], optional
-        Sequence of "first" or "last" values (default "first")
-        indicating the position of null values when sorting the keys.
-    stable : bool
-        Should the sort be stable? (no default)
-
-    Returns
-    -------
-    list[Column]
-        list of value columns sorted by keys
-    """
-    ncol = len(values)
-    order = ordering(
-        column_order or repeat(True, ncol),
-        null_precedence or repeat("first", ncol),
-    )
-    func = getattr(
-        pylibcudf.sorting,
-        f"{'stable_' if stable else ''}segmented_sort_by_key"
-    )
-    return columns_from_pylibcudf_table(
-        func(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in values]),
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in keys]),
-            segment_offsets.to_pylibcudf(mode="read"),
-            order[0],
-            order[1],
-        )
-    )
-
-
-@acquire_spill_lock()
-def digitize(list source_columns, list bins, bool right=False):
-    """
-    Return the indices of the bins to which each value in source_table belongs.
-
-    Parameters
-    ----------
-    source_columns : Input columns to be binned.
-    bins : List containing columns of bins
-    right : Indicating whether the intervals include the
-            right or the left bin edge.
-    """
-    return Column.from_pylibcudf(
-        getattr(pylibcudf.search, "lower_bound" if right else "upper_bound")(
-            pylibcudf.Table(
-                [c.to_pylibcudf(mode="read") for c in bins]
-            ),
-            pylibcudf.Table(
-                [c.to_pylibcudf(mode="read") for c in source_columns]
-            ),
-            [pylibcudf.types.Order.ASCENDING]*len(bins),
-            [pylibcudf.types.NullOrder.BEFORE]*len(bins)
-        )
-    )
-
-
-@acquire_spill_lock()
-def rank_columns(list source_columns, rank_method method, str na_option,
-                 bool ascending, bool pct
-                 ):
-    """
-    Compute numerical data ranks (1 through n) of each column in the dataframe
-    """
-    column_order = (
-        pylibcudf.types.Order.ASCENDING
-        if ascending
-        else pylibcudf.types.Order.DESCENDING
-    )
-    # ascending
-    #    #top    = na_is_smallest
-    #    #bottom = na_is_largest
-    #    #keep   = na_is_largest
-    # descending
-    #    #top    = na_is_largest
-    #    #bottom = na_is_smallest
-    #    #keep   = na_is_smallest
-    if ascending:
-        if na_option == 'top':
-            null_precedence = pylibcudf.types.NullOrder.BEFORE
-        else:
-            null_precedence = pylibcudf.types.NullOrder.AFTER
-    else:
-        if na_option == 'top':
-            null_precedence = pylibcudf.types.NullOrder.AFTER
-        else:
-            null_precedence = pylibcudf.types.NullOrder.BEFORE
-    c_null_handling = (
-        pylibcudf.types.NullPolicy.EXCLUDE
-        if na_option == 'keep'
-        else pylibcudf.types.NullPolicy.INCLUDE
-    )
-
-    return [
-        Column.from_pylibcudf(
-            pylibcudf.sorting.rank(
-                col.to_pylibcudf(mode="read"),
-                method,
-                column_order,
-                c_null_handling,
-                null_precedence,
-                pct,
-            )
-        )
-        for col in source_columns
-    ]
diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx
deleted file mode 100644
index 1b8831940e3..00000000000
--- a/python/cudf/cudf/_lib/stream_compaction.pyx
+++ /dev/null
@@ -1,181 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from libcpp cimport bool
-
-from cudf._lib.column cimport Column
-from cudf._lib.utils cimport columns_from_pylibcudf_table
-
-import pylibcudf
-
-
-@acquire_spill_lock()
-def drop_nulls(list columns, how="any", keys=None, thresh=None):
-    """
-    Drops null rows from cols depending on key columns.
-
-    Parameters
-    ----------
-    columns : list of columns
-    how  : "any" or "all". If thresh is None, drops rows of cols that have any
-           nulls or all nulls (respectively) in subset (default: "any")
-    keys : List of column indices. If set, then these columns are checked for
-           nulls rather than all of columns (optional)
-    thresh : Minimum number of non-nulls required to keep a row (optional)
-
-    Returns
-    -------
-    columns with null rows dropped
-    """
-    if how not in {"any", "all"}:
-        raise ValueError("how must be 'any' or 'all'")
-
-    keys = list(keys if keys is not None else range(len(columns)))
-
-    # Note: If how == "all" and thresh is specified this prioritizes thresh
-    if thresh is not None:
-        keep_threshold = thresh
-    elif how == "all":
-        keep_threshold = 1
-    else:
-        keep_threshold = len(keys)
-
-    return columns_from_pylibcudf_table(
-        pylibcudf.stream_compaction.drop_nulls(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]),
-            keys,
-            keep_threshold,
-        )
-    )
-
-
-@acquire_spill_lock()
-def apply_boolean_mask(list columns, Column boolean_mask):
-    """
-    Drops the rows which correspond to False in boolean_mask.
-
-    Parameters
-    ----------
-    columns : list of columns whose rows are dropped as per boolean_mask
-    boolean_mask : a boolean column of same size as source_table
-
-    Returns
-    -------
-    columns obtained from applying mask
-    """
-    return columns_from_pylibcudf_table(
-        pylibcudf.stream_compaction.apply_boolean_mask(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]),
-            boolean_mask.to_pylibcudf(mode="read"),
-        )
-    )
-
-
-_keep_options = {
-    "first": pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
-    "last": pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_LAST,
-    False: pylibcudf.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-}
-
-
-@acquire_spill_lock()
-def drop_duplicates(list columns,
-                    object keys=None,
-                    object keep='first',
-                    bool nulls_are_equal=True):
-    """
-    Drops rows in source_table as per duplicate rows in keys.
-
-    Parameters
-    ----------
-    columns : List of columns
-    keys : List of column indices. If set, then these columns are checked for
-           duplicates rather than all of columns (optional)
-    keep : keep 'first' or 'last' or none of the duplicate rows
-    nulls_are_equal : if True, nulls are treated equal else not.
-
-    Returns
-    -------
-    columns with duplicate dropped
-    """
-    if (keep_option := _keep_options.get(keep)) is None:
-        raise ValueError('keep must be either "first", "last" or False')
-
-    return columns_from_pylibcudf_table(
-        pylibcudf.stream_compaction.stable_distinct(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]),
-            list(keys if keys is not None else range(len(columns))),
-            keep_option,
-            pylibcudf.types.NullEquality.EQUAL
-            if nulls_are_equal else pylibcudf.types.NullEquality.UNEQUAL,
-            pylibcudf.types.NanEquality.ALL_EQUAL,
-        )
-    )
-
-
-@acquire_spill_lock()
-def distinct_indices(
-    list columns,
-    object keep="first",
-    bool nulls_equal=True,
-    bool nans_equal=True,
-):
-    """
-    Return indices of the distinct rows in a table.
-
-    Parameters
-    ----------
-    columns : list of columns to check for duplicates
-    keep : treat "first", "last", or (False) none of any duplicate
-        rows as distinct
-    nulls_equal : Should nulls compare equal
-    nans_equal: Should nans compare equal
-
-    Returns
-    -------
-    Column of indices
-
-    See Also
-    --------
-    drop_duplicates
-    """
-    if (keep_option := _keep_options.get(keep)) is None:
-        raise ValueError('keep must be either "first", "last" or False')
-
-    return Column.from_pylibcudf(
-        pylibcudf.stream_compaction.distinct_indices(
-            pylibcudf.Table([c.to_pylibcudf(mode="read") for c in columns]),
-            keep_option,
-            pylibcudf.types.NullEquality.EQUAL
-            if nulls_equal else pylibcudf.types.NullEquality.UNEQUAL,
-            pylibcudf.types.NanEquality.ALL_EQUAL
-            if nans_equal else pylibcudf.types.NanEquality.UNEQUAL,
-        )
-    )
-
-
-@acquire_spill_lock()
-def distinct_count(Column source_column, ignore_nulls=True, nan_as_null=False):
-    """
-    Finds number of unique rows in `source_column`
-
-    Parameters
-    ----------
-    source_column : source table checked for unique rows
-    ignore_nulls : If True nulls are ignored,
-                   else counted as one more distinct value
-    nan_as_null  : If True, NAN is considered NULL,
-                   else counted as one more distinct value
-
-    Returns
-    -------
-    Count of number of unique rows in `source_column`
-    """
-    return pylibcudf.stream_compaction.distinct_count(
-        source_column.to_pylibcudf(mode="read"),
-        pylibcudf.types.NullPolicy.EXCLUDE
-        if ignore_nulls else pylibcudf.types.NullPolicy.INCLUDE,
-        pylibcudf.types.NanPolicy.NAN_IS_NULL
-        if nan_as_null else pylibcudf.types.NanPolicy.NAN_IS_VALID,
-    )
diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
deleted file mode 100644
index 06ee07d8e2b..00000000000
--- a/python/cudf/cudf/_lib/string_casting.pyx
+++ /dev/null
@@ -1,598 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-from pylibcudf.types cimport DataType
-
-from cudf._lib.scalar import as_device_scalar
-
-from cudf._lib.types cimport dtype_to_pylibcudf_type
-
-
-def floating_to_string(Column input_col):
-    plc_column = plc.strings.convert.convert_floats.from_floats(
-        input_col.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def string_to_floating(Column input_col, DataType out_type):
-    plc_column = plc.strings.convert.convert_floats.to_floats(
-        input_col.to_pylibcudf(mode="read"),
-        out_type
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def dtos(Column input_col):
-    """
-    Converting/Casting input column of type double to string column
-
-    Parameters
-    ----------
-    input_col : input column of type double
-
-    Returns
-    -------
-    A Column with double values cast to string
-    """
-
-    return floating_to_string(input_col)
-
-
-def stod(Column input_col):
-    """
-    Converting/Casting input column of type string to double
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to double
-    """
-
-    return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT64))
-
-
-def ftos(Column input_col):
-    """
-    Converting/Casting input column of type float to string column
-
-    Parameters
-    ----------
-    input_col : input column of type double
-
-    Returns
-    -------
-    A Column with float values cast to string
-    """
-
-    return floating_to_string(input_col)
-
-
-def stof(Column input_col):
-    """
-    Converting/Casting input column of type string to float
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to float
-    """
-
-    return string_to_floating(input_col, plc.DataType(plc.TypeId.FLOAT32))
-
-
-def integer_to_string(Column input_col):
-    plc_column = plc.strings.convert.convert_integers.from_integers(
-        input_col.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def string_to_integer(Column input_col, DataType out_type):
-    plc_column = plc.strings.convert.convert_integers.to_integers(
-        input_col.to_pylibcudf(mode="read"),
-        out_type
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def i8tos(Column input_col):
-    """
-    Converting/Casting input column of type int8 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type int8
-
-    Returns
-    -------
-    A Column with int8 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoi8(Column input_col):
-    """
-    Converting/Casting input column of type string to int8
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to int8
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.INT8))
-
-
-def i16tos(Column input_col):
-    """
-    Converting/Casting input column of type int16 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type int16
-
-    Returns
-    -------
-    A Column with int16 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoi16(Column input_col):
-    """
-    Converting/Casting input column of type string to int16
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to int16
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.INT16))
-
-
-def itos(Column input_col):
-    """
-    Converting/Casting input column of type int32 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type int32
-
-    Returns
-    -------
-    A Column with int32 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoi(Column input_col):
-    """
-    Converting/Casting input column of type string to int32
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to int32
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.INT32))
-
-
-def ltos(Column input_col):
-    """
-    Converting/Casting input column of type int64 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type int64
-
-    Returns
-    -------
-    A Column with int64 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stol(Column input_col):
-    """
-    Converting/Casting input column of type string to int64
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to int64
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.INT64))
-
-
-def ui8tos(Column input_col):
-    """
-    Converting/Casting input column of type uint8 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type uint8
-
-    Returns
-    -------
-    A Column with uint8 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoui8(Column input_col):
-    """
-    Converting/Casting input column of type string to uint8
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to uint8
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT8))
-
-
-def ui16tos(Column input_col):
-    """
-    Converting/Casting input column of type uint16 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type uint16
-
-    Returns
-    -------
-    A Column with uint16 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoui16(Column input_col):
-    """
-    Converting/Casting input column of type string to uint16
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to uint16
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT16))
-
-
-def uitos(Column input_col):
-    """
-    Converting/Casting input column of type uint32 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type uint32
-
-    Returns
-    -------
-    A Column with uint32 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoui(Column input_col):
-    """
-    Converting/Casting input column of type string to uint32
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to uint32
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT32))
-
-
-def ultos(Column input_col):
-    """
-    Converting/Casting input column of type uint64 to string column
-
-    Parameters
-    ----------
-    input_col : input column of type uint64
-
-    Returns
-    -------
-    A Column with uint64 values cast to string
-    """
-
-    return integer_to_string(input_col)
-
-
-def stoul(Column input_col):
-    """
-    Converting/Casting input column of type string to uint64
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with strings cast to uint64
-    """
-
-    return string_to_integer(input_col, plc.DataType(plc.TypeId.UINT64))
-
-
-def to_booleans(Column input_col):
-    plc_column = plc.strings.convert.convert_booleans.to_booleans(
-        input_col.to_pylibcudf(mode="read"),
-        as_device_scalar("True").c_value,
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def from_booleans(Column input_col):
-    plc_column = plc.strings.convert.convert_booleans.from_booleans(
-        input_col.to_pylibcudf(mode="read"),
-        as_device_scalar("True").c_value,
-        as_device_scalar("False").c_value,
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def int2timestamp(
-        Column input_col,
-        str format,
-        Column names):
-    """
-    Converting/Casting input date-time column to string
-    column with specified format
-
-    Parameters
-    ----------
-    input_col : input column of type timestamp in integer format
-    format : The string specifying output format
-    names : The string names to use for weekdays ("%a", "%A") and
-    months ("%b", "%B")
-
-    Returns
-    -------
-    A Column with date-time represented in string format
-
-    """
-    return Column.from_pylibcudf(
-        plc.strings.convert.convert_datetime.from_timestamps(
-            input_col.to_pylibcudf(mode="read"),
-            format,
-            names.to_pylibcudf(mode="read")
-        )
-    )
-
-
-def timestamp2int(Column input_col, dtype, format):
-    """
-    Converting/Casting input string column to date-time column with specified
-    timestamp_format
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with string represented in date-time format
-
-    """
-    dtype = dtype_to_pylibcudf_type(dtype)
-    return Column.from_pylibcudf(
-        plc.strings.convert.convert_datetime.to_timestamps(
-            input_col.to_pylibcudf(mode="read"),
-            dtype,
-            format
-        )
-    )
-
-
-def istimestamp(Column input_col, str format):
-    """
-    Check input string column matches the specified timestamp format
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    format : format string of timestamp specifiers
-
-    Returns
-    -------
-    A Column of boolean values identifying strings that matched the format.
-
-    """
-    plc_column = plc.strings.convert.convert_datetime.is_timestamp(
-        input_col.to_pylibcudf(mode="read"),
-        format
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def timedelta2int(Column input_col, dtype, format):
-    """
-    Converting/Casting input string column to TimeDelta column with specified
-    format
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column with string represented in TimeDelta format
-
-    """
-    dtype = dtype_to_pylibcudf_type(dtype)
-    return Column.from_pylibcudf(
-        plc.strings.convert.convert_durations.to_durations(
-            input_col.to_pylibcudf(mode="read"),
-            dtype,
-            format
-        )
-    )
-
-
-def int2timedelta(Column input_col, str format):
-    """
-    Converting/Casting input Timedelta column to string
-    column with specified format
-
-    Parameters
-    ----------
-    input_col : input column of type Timedelta in integer format
-
-    Returns
-    -------
-    A Column with Timedelta represented in string format
-
-    """
-    return Column.from_pylibcudf(
-        plc.strings.convert.convert_durations.from_durations(
-            input_col.to_pylibcudf(mode="read"),
-            format
-        )
-    )
-
-
-def int2ip(Column input_col):
-    """
-    Converting/Casting integer column to string column in ipv4 format
-
-    Parameters
-    ----------
-    input_col : input integer column
-
-    Returns
-    -------
-    A Column with integer represented in string ipv4 format
-
-    """
-    plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4(
-        input_col.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def ip2int(Column input_col):
-    """
-    Converting string ipv4 column to integer column
-
-    Parameters
-    ----------
-    input_col : input string column
-
-    Returns
-    -------
-    A Column with ipv4 represented as integer
-
-    """
-    plc_column = plc.strings.convert.convert_ipv4.ipv4_to_integers(
-        input_col.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def is_ipv4(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have strings in IPv4 format. This format is nnn.nnn.nnn.nnn
-    where nnn is integer digits in [0,255].
-    """
-    plc_column = plc.strings.convert.convert_ipv4.is_ipv4(
-        source_strings.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def htoi(Column input_col):
-    """
-    Converting input column of type string having hex values
-    to integer of out_type
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    A Column of integers parsed from hexadecimal string values.
-    """
-    plc_column = plc.strings.convert.convert_integers.hex_to_integers(
-        input_col.to_pylibcudf(mode="read"),
-        plc.DataType(plc.TypeId.INT64)
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def is_hex(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have hex characters.
-    """
-    plc_column = plc.strings.convert.convert_integers.is_hex(
-        source_strings.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def itoh(Column input_col):
-    """
-    Converting input column of type integer to a string
-    column with hexadecimal character digits.
-
-    Parameters
-    ----------
-    input_col : input column of type integer
-
-    Returns
-    -------
-    A Column of strings with hexadecimal characters.
-    """
-    plc_column = plc.strings.convert.convert_integers.integers_to_hex(
-        input_col.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/CMakeLists.txt b/python/cudf/cudf/_lib/strings/CMakeLists.txt
deleted file mode 100644
index dca9c4cc3fc..00000000000
--- a/python/cudf/cudf/_lib/strings/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-add_subdirectory(convert)
-add_subdirectory(split)
diff --git a/python/cudf/cudf/_lib/strings/__init__.pxd b/python/cudf/cudf/_lib/strings/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
deleted file mode 100644
index b795c54c112..00000000000
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix
-from cudf._lib.nvtext.generate_ngrams import (
-    generate_character_ngrams,
-    generate_ngrams,
-    hash_character_ngrams,
-)
-from cudf._lib.nvtext.jaccard import jaccard_index
-from cudf._lib.nvtext.minhash import (
-    minhash,
-    minhash64,
-    minhash64_permuted,
-    minhash_permuted,
-    word_minhash,
-    word_minhash64,
-)
-from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize
-from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces
-from cudf._lib.nvtext.replace import filter_tokens, replace_tokens
-from cudf._lib.nvtext.stemmer import (
-    LetterType,
-    is_letter,
-    is_letter_multi,
-    porter_stemmer_measure,
-)
-from cudf._lib.nvtext.tokenize import (
-    _count_tokens_column,
-    _count_tokens_scalar,
-    _tokenize_column,
-    _tokenize_scalar,
-    character_tokenize,
-    detokenize,
-    tokenize_with_vocabulary,
-)
-from cudf._lib.strings.convert.convert_fixed_point import to_decimal
-from cudf._lib.strings.convert.convert_floats import is_float
-from cudf._lib.strings.convert.convert_integers import is_integer
-from cudf._lib.strings.convert.convert_urls import url_decode, url_encode
-from cudf._lib.strings.split.partition import partition, rpartition
-from cudf._lib.strings.split.split import (
-    rsplit,
-    rsplit_re,
-    rsplit_record,
-    rsplit_record_re,
-    split,
-    split_re,
-    split_record,
-    split_record_re,
-)
diff --git a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt b/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt
deleted file mode 100644
index e8a76b476a8..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/CMakeLists.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-set(cython_sources convert_fixed_point.pyx convert_floats.pyx convert_integers.pyx
-                   convert_lists.pyx convert_urls.pyx
-)
-
-set(linked_libraries cudf::cudf)
-rapids_cython_create_modules(
-  CXX
-  SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf
-)
diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.pxd b/python/cudf/cudf/_lib/strings/convert/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/strings/convert/__init__.py b/python/cudf/cudf/_lib/strings/convert/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
deleted file mode 100644
index 96dcd021c3b..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-from cudf._lib.types cimport dtype_to_pylibcudf_type
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def from_decimal(Column input_col):
-    """
-    Converts a `Decimal64Column` to a `StringColumn`.
-
-    Parameters
-    ----------
-    input_col : input column of type decimal
-
-    Returns
-    -------
-    A column of strings representing the input decimal values.
-    """
-    plc_column = plc.strings.convert.convert_fixed_point.from_fixed_point(
-        input_col.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def to_decimal(Column input_col, object out_type):
-    """
-    Returns a `Decimal64Column` from the provided `StringColumn`
-    using the scale in the `out_type`.
-
-    Parameters
-    ----------
-    input_col : input column of type string
-    out_type : The type and scale of the decimal column expected
-
-    Returns
-    -------
-    A column of decimals parsed from the string values.
-    """
-    plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point(
-        input_col.to_pylibcudf(mode="read"),
-        dtype_to_pylibcudf_type(out_type),
-    )
-    result = Column.from_pylibcudf(plc_column)
-    result.dtype.precision = out_type.precision
-    return result
-
-
-@acquire_spill_lock()
-def is_fixed_point(Column input_col, object dtype):
-    """
-    Returns a Column of boolean values with True for `input_col`
-    that have fixed-point characters. The output row also has a
-    False value if the corresponding string would cause an integer
-    overflow. The scale of the `dtype` is used to determine overflow
-    in the output row.
-
-    Parameters
-    ----------
-    input_col : input column of type string
-    dtype : The type and scale of a decimal column
-
-    Returns
-    -------
-    A Column of booleans indicating valid decimal conversion.
-    """
-    plc_column = plc.strings.convert.convert_fixed_point.is_fixed_point(
-        input_col.to_pylibcudf(mode="read"),
-        dtype_to_pylibcudf_type(dtype),
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
deleted file mode 100644
index 5da6e3f10cc..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def is_float(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have floats.
-    """
-    plc_column = plc.strings.convert.convert_floats.is_float(
-        source_strings.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
deleted file mode 100644
index 50113347ccb..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-import pylibcudf as plc
-
-from cudf._lib.column cimport Column
-
-
-@acquire_spill_lock()
-def is_integer(Column source_strings):
-    """
-    Returns a Column of boolean values with True for `source_strings`
-    that have integers.
-    """
-    return Column.from_pylibcudf(
-        plc.strings.convert.convert_integers.is_integer(
-            source_strings.to_pylibcudf(mode="read")
-        )
-    )
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
deleted file mode 100644
index 3a2cb4bd5c7..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-import pylibcudf as plc
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-from cudf._lib.scalar import as_device_scalar
-
-
-@acquire_spill_lock()
-def format_list_column(Column source_list, Column separators):
-    """
-    Format a list column of strings into a strings column.
-
-    Parameters
-    ----------
-    input_col : input column of type list with strings child.
-
-    separators: strings used for formatting (', ', '[', ']')
-
-    Returns
-    -------
-    Formatted strings column
-    """
-    plc_column = plc.strings.convert.convert_lists.format_list_column(
-        source_list.to_pylibcudf(mode="read"),
-        as_device_scalar("None").c_value,
-        separators.to_pylibcudf(mode="read"),
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
deleted file mode 100644
index d5c2f771970..00000000000
--- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import pylibcudf as plc
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-
-@acquire_spill_lock()
-def url_decode(Column source_strings):
-    """
-    Decode each string in column. No format checking is performed.
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    URL decoded string column
-    """
-    plc_column = plc.strings.convert.convert_urls.url_decode(
-        source_strings.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def url_encode(Column source_strings):
-    """
-    Encode each string in column. No format checking is performed.
-    All characters are encoded except for ASCII letters, digits,
-    and these characters: '.','_','-','~'. Encoding converts to
-    hex using UTF-8 encoded bytes.
-
-    Parameters
-    ----------
-    input_col : input column of type string
-
-    Returns
-    -------
-    URL encoded string column
-    """
-    plc_column = plc.strings.convert.convert_urls.url_encode(
-        source_strings.to_pylibcudf(mode="read")
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt b/python/cudf/cudf/_lib/strings/split/CMakeLists.txt
deleted file mode 100644
index 4ede0a2fac5..00000000000
--- a/python/cudf/cudf/_lib/strings/split/CMakeLists.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-set(cython_sources partition.pyx split.pyx)
-
-set(linked_libraries cudf::cudf)
-rapids_cython_create_modules(
-  CXX
-  SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX strings_ ASSOCIATED_TARGETS cudf
-)
diff --git a/python/cudf/cudf/_lib/strings/split/__init__.pxd b/python/cudf/cudf/_lib/strings/split/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/strings/split/__init__.py b/python/cudf/cudf/_lib/strings/split/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx
deleted file mode 100644
index 5319addc41c..00000000000
--- a/python/cudf/cudf/_lib/strings/split/partition.pyx
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def partition(Column source_strings,
-              object py_delimiter):
-    """
-    Returns data by splitting the `source_strings`
-    column at the first occurrence of the specified `py_delimiter`.
-    """
-    plc_table = plc.strings.split.partition.partition(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
-
-
-@acquire_spill_lock()
-def rpartition(Column source_strings,
-               object py_delimiter):
-    """
-    Returns a Column by splitting the `source_strings`
-    column at the last occurrence of the specified `py_delimiter`.
-    """
-    plc_table = plc.strings.split.partition.rpartition(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx
deleted file mode 100644
index 4ec6c7073d8..00000000000
--- a/python/cudf/cudf/_lib/strings/split/split.pyx
+++ /dev/null
@@ -1,155 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from cudf.core.buffer import acquire_spill_lock
-
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def split(Column source_strings,
-          object py_delimiter,
-          size_type maxsplit):
-    """
-    Returns data by splitting the `source_strings`
-    column around the specified `py_delimiter`.
-    The split happens from beginning.
-    """
-    plc_table = plc.strings.split.split.split(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value,
-        maxsplit,
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
-
-
-@acquire_spill_lock()
-def split_record(Column source_strings,
-                 object py_delimiter,
-                 size_type maxsplit):
-    """
-    Returns a Column by splitting the `source_strings`
-    column around the specified `py_delimiter`.
-    The split happens from beginning.
-    """
-    plc_column = plc.strings.split.split.split_record(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value,
-        maxsplit,
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def rsplit(Column source_strings,
-           object py_delimiter,
-           size_type maxsplit):
-    """
-    Returns data by splitting the `source_strings`
-    column around the specified `py_delimiter`.
-    The split happens from the end.
-    """
-    plc_table = plc.strings.split.split.rsplit(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value,
-        maxsplit,
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
-
-
-@acquire_spill_lock()
-def rsplit_record(Column source_strings,
-                  object py_delimiter,
-                  size_type maxsplit):
-    """
-    Returns a Column by splitting the `source_strings`
-    column around the specified `py_delimiter`.
-    The split happens from the end.
-    """
-    plc_column = plc.strings.split.split.rsplit_record(
-        source_strings.to_pylibcudf(mode="read"),
-        py_delimiter.device_value.c_value,
-        maxsplit,
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def split_re(Column source_strings,
-             object pattern,
-             size_type maxsplit):
-    """
-    Returns data by splitting the `source_strings`
-    column around the delimiters identified by `pattern`.
-    """
-    plc_table = plc.strings.split.split.split_re(
-        source_strings.to_pylibcudf(mode="read"),
-        plc.strings.regex_program.RegexProgram.create(
-            str(pattern),
-            plc.strings.regex_flags.RegexFlags.DEFAULT,
-        ),
-        maxsplit,
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
-
-
-@acquire_spill_lock()
-def rsplit_re(Column source_strings,
-              object pattern,
-              size_type maxsplit):
-    """
-    Returns data by splitting the `source_strings`
-    column around the delimiters identified by `pattern`.
-    The delimiters are searched starting from the end of each string.
-    """
-    plc_table = plc.strings.split.split.rsplit_re(
-        source_strings.to_pylibcudf(mode="read"),
-        plc.strings.regex_program.RegexProgram.create(
-            str(pattern),
-            plc.strings.regex_flags.RegexFlags.DEFAULT,
-        ),
-        maxsplit,
-    )
-    return dict(enumerate(Column.from_pylibcudf(col) for col in plc_table.columns()))
-
-
-@acquire_spill_lock()
-def split_record_re(Column source_strings,
-                    object pattern,
-                    size_type maxsplit):
-    """
-    Returns a Column by splitting the `source_strings`
-    column around the delimiters identified by `pattern`.
-    """
-    plc_column = plc.strings.split.split.split_record_re(
-        source_strings.to_pylibcudf(mode="read"),
-        plc.strings.regex_program.RegexProgram.create(
-            str(pattern),
-            plc.strings.regex_flags.RegexFlags.DEFAULT,
-        ),
-        maxsplit,
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def rsplit_record_re(Column source_strings,
-                     object pattern,
-                     size_type maxsplit):
-    """
-    Returns a Column by splitting the `source_strings`
-    column around the delimiters identified by `pattern`.
-    The delimiters are searched starting from the end of each string.
-    """
-    plc_column = plc.strings.split.split.rsplit_record_re(
-        source_strings.to_pylibcudf(mode="read"),
-        plc.strings.regex_program.RegexProgram.create(
-            str(pattern),
-            plc.strings.regex_flags.RegexFlags.DEFAULT,
-        ),
-        maxsplit,
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx
index dd2fafbe07f..83f0cb850a5 100644
--- a/python/cudf/cudf/_lib/strings_udf.pyx
+++ b/python/cudf/cudf/_lib/strings_udf.pyx
@@ -1,7 +1,6 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint8_t, uint16_t, uintptr_t
-
 from pylibcudf.libcudf.strings_udf cimport (
     get_character_cases_table as cpp_get_character_cases_table,
     get_character_flags_table as cpp_get_character_flags_table,
@@ -27,6 +26,7 @@ from rmm.librmm.device_buffer cimport device_buffer
 from rmm.pylibrmm.device_buffer cimport DeviceBuffer
 
 from cudf._lib.column cimport Column
+from pylibcudf cimport Column as plc_Column
 
 
 def get_cuda_build_version():
@@ -52,9 +52,9 @@ def column_from_udf_string_array(DeviceBuffer d_buffer):
         c_result = move(cpp_column_from_udf_string_array(data, size))
         cpp_free_udf_string_array(data, size)
 
-    result = Column.from_unique_ptr(move(c_result))
-
-    return result
+    return Column.from_pylibcudf(
+        plc_Column.from_libcudf(move(c_result))
+    )
 
 
 def get_character_flags_table_ptr():
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
deleted file mode 100644
index 7942d067c2b..00000000000
--- a/python/cudf/cudf/_lib/text.pyx
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp cimport bool
-
-from io import TextIOBase
-
-import pylibcudf as plc
-
-from cudf._lib.column cimport Column
-
-
-def read_text(object filepaths_or_buffers,
-              str delimiter,
-              object byte_range,
-              bool strip_delimiters,
-              object compression,
-              object compression_offsets):
-    """
-    Cython function to call into libcudf API, see `multibyte_split`.
-
-    See Also
-    --------
-    cudf.io.text.read_text
-    """
-    if compression is None:
-        if isinstance(filepaths_or_buffers, TextIOBase):
-            datasource = plc.io.text.make_source(filepaths_or_buffers.read())
-        else:
-            datasource = plc.io.text.make_source_from_file(filepaths_or_buffers)
-    elif compression == "bgzip":
-        if isinstance(filepaths_or_buffers, TextIOBase):
-            raise ValueError("bgzip compression requires a file path")
-        if compression_offsets is not None:
-            if len(compression_offsets) != 2:
-                raise ValueError(
-                    "compression offsets need to consist of two elements")
-            datasource = plc.io.text.make_source_from_bgzip_file(
-                filepaths_or_buffers,
-                compression_offsets[0],
-                compression_offsets[1]
-            )
-        else:
-            datasource = plc.io.text.make_source_from_bgzip_file(
-                filepaths_or_buffers,
-            )
-    else:
-        raise ValueError("Only bgzip compression is supported at the moment")
-
-    options = plc.io.text.ParseOptions(
-        byte_range=byte_range, strip_delimiters=strip_delimiters
-    )
-    plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
deleted file mode 100644
index a163bb07888..00000000000
--- a/python/cudf/cudf/_lib/transform.pyx
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from numba.np import numpy_support
-
-import cudf
-from cudf.core.buffer import acquire_spill_lock, as_buffer
-from cudf.utils import cudautils
-
-from pylibcudf cimport transform as plc_transform
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-
-import pylibcudf as plc
-
-
-@acquire_spill_lock()
-def bools_to_mask(Column col):
-    """
-    Given an int8 (boolean) column, compress the data from booleans to bits and
-    return a Buffer
-    """
-    mask, _ = plc_transform.bools_to_mask(col.to_pylibcudf(mode="read"))
-    return as_buffer(mask)
-
-
-@acquire_spill_lock()
-def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit):
-    """
-    Given a mask buffer, returns a boolean column representng bit 0 -> False
-    and 1 -> True within range of [begin_bit, end_bit),
-    """
-    if not isinstance(mask_buffer, cudf.core.buffer.Buffer):
-        raise TypeError("mask_buffer is not an instance of "
-                        "cudf.core.buffer.Buffer")
-    plc_column = plc_transform.mask_to_bools(
-        mask_buffer.get_ptr(mode="read"), begin_bit, end_bit
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-@acquire_spill_lock()
-def nans_to_nulls(Column input):
-    mask, _ = plc_transform.nans_to_nulls(
-        input.to_pylibcudf(mode="read")
-    )
-    return as_buffer(mask)
-
-
-@acquire_spill_lock()
-def transform(Column input, op):
-    nb_type = numpy_support.from_dtype(input.dtype)
-    nb_signature = (nb_type,)
-    compiled_op = cudautils.compile_udf(op, nb_signature)
-    np_dtype = cudf.dtype(compiled_op[1])
-
-    plc_column = plc_transform.transform(
-        input.to_pylibcudf(mode="read"),
-        compiled_op[0],
-        plc.column._datatype_from_dtype_desc(np_dtype.str[1:]),
-        True
-    )
-    return Column.from_pylibcudf(plc_column)
-
-
-def table_encode(list source_columns):
-    plc_table, plc_column = plc_transform.encode(
-        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns])
-    )
-
-    return (
-        [Column.from_pylibcudf(col) for col in plc_table.columns()],
-        Column.from_pylibcudf(plc_column)
-    )
-
-
-def one_hot_encode(Column input_column, Column categories):
-    plc_table = plc_transform.one_hot_encode(
-        input_column.to_pylibcudf(mode="read"),
-        categories.to_pylibcudf(mode="read"),
-    )
-    result_columns = [
-        Column.from_pylibcudf(col, data_ptr_exposed=True)
-        for col in plc_table.columns()
-    ]
-    result_labels = [
-        x if x is not None else '<NA>'
-        for x in categories.to_arrow().to_pylist()
-    ]
-    return dict(zip(result_labels, result_columns))
-
-
-@acquire_spill_lock()
-def compute_column(list columns, tuple column_names, str expr):
-    """Compute a new column by evaluating an expression on a set of columns.
-
-    Parameters
-    ----------
-    columns : list
-        The set of columns forming the table to evaluate the expression on.
-    column_names : tuple[str]
-        The names associated with each column. These names are necessary to map
-        column names in the expression to indices in the provided list of
-        columns, which are what will be used by libcudf to evaluate the
-        expression on the table.
-    expr : str
-        The expression to evaluate.
-    """
-    result = plc_transform.compute_column(
-        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
-        plc.expressions.to_expression(expr, column_names),
-    )
-    return Column.from_pylibcudf(result)
diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd
index c2b760490c1..18b1d26e4db 100644
--- a/python/cudf/cudf/_lib/types.pxd
+++ b/python/cudf/cudf/_lib/types.pxd
@@ -1,16 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
-from libcpp cimport bool
 
-cimport pylibcudf.libcudf.types as libcudf_types
 from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
 ctypedef int32_t underlying_type_t_type_id
 
 cdef dtype_from_column_view(column_view cv)
 
-cdef libcudf_types.data_type dtype_to_data_type(dtype) except *
 cpdef dtype_to_pylibcudf_type(dtype)
-cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index f169ea12b10..777bd070b32 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -1,7 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from enum import IntEnum
-
 import numpy as np
 import pandas as pd
 
@@ -11,138 +9,46 @@ cimport pylibcudf.libcudf.types as libcudf_types
 from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
-import pylibcudf
+import pylibcudf as plc
 
 import cudf
 
 
-class TypeId(IntEnum):
-    EMPTY = <underlying_type_t_type_id> libcudf_types.type_id.EMPTY
-    INT8 = <underlying_type_t_type_id> libcudf_types.type_id.INT8
-    INT16 = <underlying_type_t_type_id> libcudf_types.type_id.INT16
-    INT32 = <underlying_type_t_type_id> libcudf_types.type_id.INT32
-    INT64 = <underlying_type_t_type_id> libcudf_types.type_id.INT64
-    UINT8 = <underlying_type_t_type_id> libcudf_types.type_id.UINT8
-    UINT16 = <underlying_type_t_type_id> libcudf_types.type_id.UINT16
-    UINT32 = <underlying_type_t_type_id> libcudf_types.type_id.UINT32
-    UINT64 = <underlying_type_t_type_id> libcudf_types.type_id.UINT64
-    FLOAT32 = <underlying_type_t_type_id> libcudf_types.type_id.FLOAT32
-    FLOAT64 = <underlying_type_t_type_id> libcudf_types.type_id.FLOAT64
-    BOOL8 = <underlying_type_t_type_id> libcudf_types.type_id.BOOL8
-    TIMESTAMP_DAYS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.TIMESTAMP_DAYS
-    )
-    TIMESTAMP_SECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.TIMESTAMP_SECONDS
-    )
-    TIMESTAMP_MILLISECONDS = (
-        <underlying_type_t_type_id> (
-            libcudf_types.type_id.TIMESTAMP_MILLISECONDS
-        )
-    )
-    TIMESTAMP_MICROSECONDS = (
-        <underlying_type_t_type_id> (
-            libcudf_types.type_id.TIMESTAMP_MICROSECONDS
-        )
-    )
-    TIMESTAMP_NANOSECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.TIMESTAMP_NANOSECONDS
-    )
-    DURATION_SECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.DURATION_SECONDS
-    )
-    DURATION_MILLISECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.DURATION_MILLISECONDS
-    )
-    DURATION_MICROSECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.DURATION_MICROSECONDS
-    )
-    DURATION_NANOSECONDS = (
-        <underlying_type_t_type_id> libcudf_types.type_id.DURATION_NANOSECONDS
-    )
-    STRING = <underlying_type_t_type_id> libcudf_types.type_id.STRING
-    DECIMAL32 = <underlying_type_t_type_id> libcudf_types.type_id.DECIMAL32
-    DECIMAL64 = <underlying_type_t_type_id> libcudf_types.type_id.DECIMAL64
-    DECIMAL128 = <underlying_type_t_type_id> libcudf_types.type_id.DECIMAL128
-    STRUCT = <underlying_type_t_type_id> libcudf_types.type_id.STRUCT
-
-
-SUPPORTED_NUMPY_TO_LIBCUDF_TYPES = {
-    np.dtype("int8"): TypeId.INT8,
-    np.dtype("int16"): TypeId.INT16,
-    np.dtype("int32"): TypeId.INT32,
-    np.dtype("int64"): TypeId.INT64,
-    np.dtype("uint8"): TypeId.UINT8,
-    np.dtype("uint16"): TypeId.UINT16,
-    np.dtype("uint32"): TypeId.UINT32,
-    np.dtype("uint64"): TypeId.UINT64,
-    np.dtype("float32"): TypeId.FLOAT32,
-    np.dtype("float64"): TypeId.FLOAT64,
-    np.dtype("datetime64[s]"): TypeId.TIMESTAMP_SECONDS,
-    np.dtype("datetime64[ms]"): TypeId.TIMESTAMP_MILLISECONDS,
-    np.dtype("datetime64[us]"): TypeId.TIMESTAMP_MICROSECONDS,
-    np.dtype("datetime64[ns]"): TypeId.TIMESTAMP_NANOSECONDS,
-    np.dtype("object"): TypeId.STRING,
-    np.dtype("bool"): TypeId.BOOL8,
-    np.dtype("timedelta64[s]"): TypeId.DURATION_SECONDS,
-    np.dtype("timedelta64[ms]"): TypeId.DURATION_MILLISECONDS,
-    np.dtype("timedelta64[us]"): TypeId.DURATION_MICROSECONDS,
-    np.dtype("timedelta64[ns]"): TypeId.DURATION_NANOSECONDS,
-}
-
 SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = {
-    k: pylibcudf.TypeId(v).value
-    for k, v in SUPPORTED_NUMPY_TO_LIBCUDF_TYPES.items()
+    np.dtype("int8"): plc.types.TypeId.INT8,
+    np.dtype("int16"): plc.types.TypeId.INT16,
+    np.dtype("int32"): plc.types.TypeId.INT32,
+    np.dtype("int64"): plc.types.TypeId.INT64,
+    np.dtype("uint8"): plc.types.TypeId.UINT8,
+    np.dtype("uint16"): plc.types.TypeId.UINT16,
+    np.dtype("uint32"): plc.types.TypeId.UINT32,
+    np.dtype("uint64"): plc.types.TypeId.UINT64,
+    np.dtype("float32"): plc.types.TypeId.FLOAT32,
+    np.dtype("float64"): plc.types.TypeId.FLOAT64,
+    np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS,
+    np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS,
+    np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS,
+    np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS,
+    np.dtype("object"): plc.types.TypeId.STRING,
+    np.dtype("bool"): plc.types.TypeId.BOOL8,
+    np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS,
+    np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS,
+    np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS,
+    np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS,
 }
-
-LIBCUDF_TO_SUPPORTED_NUMPY_TYPES = {
-    # There's no equivalent to EMPTY in cudf.  We translate EMPTY
-    # columns from libcudf to ``int8`` columns of all nulls in Python.
-    # ``int8`` is chosen because it uses the least amount of memory.
-    TypeId.EMPTY: np.dtype("int8"),
-    TypeId.INT8: np.dtype("int8"),
-    TypeId.INT16: np.dtype("int16"),
-    TypeId.INT32: np.dtype("int32"),
-    TypeId.INT64: np.dtype("int64"),
-    TypeId.UINT8: np.dtype("uint8"),
-    TypeId.UINT16: np.dtype("uint16"),
-    TypeId.UINT32: np.dtype("uint32"),
-    TypeId.UINT64: np.dtype("uint64"),
-    TypeId.FLOAT32: np.dtype("float32"),
-    TypeId.FLOAT64: np.dtype("float64"),
-    TypeId.BOOL8: np.dtype("bool"),
-    TypeId.TIMESTAMP_SECONDS: np.dtype("datetime64[s]"),
-    TypeId.TIMESTAMP_MILLISECONDS: np.dtype("datetime64[ms]"),
-    TypeId.TIMESTAMP_MICROSECONDS: np.dtype("datetime64[us]"),
-    TypeId.TIMESTAMP_NANOSECONDS: np.dtype("datetime64[ns]"),
-    TypeId.DURATION_SECONDS: np.dtype("timedelta64[s]"),
-    TypeId.DURATION_MILLISECONDS: np.dtype("timedelta64[ms]"),
-    TypeId.DURATION_MICROSECONDS: np.dtype("timedelta64[us]"),
-    TypeId.DURATION_NANOSECONDS: np.dtype("timedelta64[ns]"),
-    TypeId.STRING: np.dtype("object"),
-    TypeId.STRUCT: np.dtype("object"),
-}
-
 PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = {
-    pylibcudf.TypeId(k).value: v
-    for k, v in LIBCUDF_TO_SUPPORTED_NUMPY_TYPES.items()
+    plc_type: np_type
+    for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items()
 }
+# There's no equivalent to EMPTY in cudf.  We translate EMPTY
+# columns from libcudf to ``int8`` columns of all nulls in Python.
+# ``int8`` is chosen because it uses the least amount of memory.
+PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8")
+PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object")
+PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object")
 
-duration_unit_map = {
-    TypeId.DURATION_SECONDS: "s",
-    TypeId.DURATION_MILLISECONDS: "ms",
-    TypeId.DURATION_MICROSECONDS: "us",
-    TypeId.DURATION_NANOSECONDS: "ns"
-}
-
-datetime_unit_map = {
-    TypeId.TIMESTAMP_SECONDS: "s",
-    TypeId.TIMESTAMP_MILLISECONDS: "ms",
-    TypeId.TIMESTAMP_MICROSECONDS: "us",
-    TypeId.TIMESTAMP_NANOSECONDS: "ns",
-}
 
-size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID]
+size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID]
 
 
 cdef dtype_from_lists_column_view(column_view cv):
@@ -190,71 +96,40 @@ cdef dtype_from_column_view(column_view cv):
             scale=-cv.type().scale()
         )
     else:
-        return LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
+        return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
             <underlying_type_t_type_id>(tid)
         ]
 
-cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
-    # Note: This function is to be phased out in favor of
-    # dtype_to_pylibcudf_type which will return a pylibcudf
-    # DataType object
-    cdef libcudf_types.type_id tid
-    if isinstance(dtype, cudf.ListDtype):
-        tid = libcudf_types.type_id.LIST
-    elif isinstance(dtype, cudf.StructDtype):
-        tid = libcudf_types.type_id.STRUCT
-    elif isinstance(dtype, cudf.Decimal128Dtype):
-        tid = libcudf_types.type_id.DECIMAL128
-    elif isinstance(dtype, cudf.Decimal64Dtype):
-        tid = libcudf_types.type_id.DECIMAL64
-    elif isinstance(dtype, cudf.Decimal32Dtype):
-        tid = libcudf_types.type_id.DECIMAL32
-    else:
-        tid = <libcudf_types.type_id> (
-            <underlying_type_t_type_id> (
-                SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[np.dtype(dtype)]))
-
-    if is_decimal_type_id(tid):
-        return libcudf_types.data_type(tid, -dtype.scale)
-    else:
-        return libcudf_types.data_type(tid)
 
 cpdef dtype_to_pylibcudf_type(dtype):
     if isinstance(dtype, cudf.ListDtype):
-        return pylibcudf.DataType(pylibcudf.TypeId.LIST)
+        return plc.DataType(plc.TypeId.LIST)
     elif isinstance(dtype, cudf.StructDtype):
-        return pylibcudf.DataType(pylibcudf.TypeId.STRUCT)
+        return plc.DataType(plc.TypeId.STRUCT)
     elif isinstance(dtype, cudf.Decimal128Dtype):
-        tid = pylibcudf.TypeId.DECIMAL128
-        return pylibcudf.DataType(tid, -dtype.scale)
+        tid = plc.TypeId.DECIMAL128
+        return plc.DataType(tid, -dtype.scale)
     elif isinstance(dtype, cudf.Decimal64Dtype):
-        tid = pylibcudf.TypeId.DECIMAL64
-        return pylibcudf.DataType(tid, -dtype.scale)
+        tid = plc.TypeId.DECIMAL64
+        return plc.DataType(tid, -dtype.scale)
     elif isinstance(dtype, cudf.Decimal32Dtype):
-        tid = pylibcudf.TypeId.DECIMAL32
-        return pylibcudf.DataType(tid, -dtype.scale)
-    # libcudf types don't support localization so convert to the base type
+        tid = plc.TypeId.DECIMAL32
+        return plc.DataType(tid, -dtype.scale)
+    # libcudf types don't support timezones so convert to the base type
     elif isinstance(dtype, pd.DatetimeTZDtype):
         dtype = np.dtype(f"<M8[{dtype.unit}]")
     else:
         dtype = np.dtype(dtype)
-    return pylibcudf.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype])
-
-cdef bool is_decimal_type_id(libcudf_types.type_id tid) except *:
-    return tid in (
-        libcudf_types.type_id.DECIMAL128,
-        libcudf_types.type_id.DECIMAL64,
-        libcudf_types.type_id.DECIMAL32,
-    )
+    return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype])
 
 
 def dtype_from_pylibcudf_lists_column(col):
     child = col.list_view().child()
     tid = child.type().id()
 
-    if tid == pylibcudf.TypeId.LIST:
+    if tid == plc.TypeId.LIST:
         return cudf.ListDtype(dtype_from_pylibcudf_lists_column(child))
-    elif tid == pylibcudf.TypeId.EMPTY:
+    elif tid == plc.TypeId.EMPTY:
         return cudf.ListDtype("int8")
     else:
         return cudf.ListDtype(
@@ -274,26 +149,24 @@ def dtype_from_pylibcudf_column(col):
     type_ = col.type()
     tid = type_.id()
 
-    if tid == pylibcudf.TypeId.LIST:
+    if tid == plc.TypeId.LIST:
         return dtype_from_pylibcudf_lists_column(col)
-    elif tid == pylibcudf.TypeId.STRUCT:
+    elif tid == plc.TypeId.STRUCT:
         return dtype_from_pylibcudf_structs_column(col)
-    elif tid == pylibcudf.TypeId.DECIMAL64:
+    elif tid == plc.TypeId.DECIMAL64:
         return cudf.Decimal64Dtype(
             precision=cudf.Decimal64Dtype.MAX_PRECISION,
             scale=-type_.scale()
         )
-    elif tid == pylibcudf.TypeId.DECIMAL32:
+    elif tid == plc.TypeId.DECIMAL32:
         return cudf.Decimal32Dtype(
             precision=cudf.Decimal32Dtype.MAX_PRECISION,
             scale=-type_.scale()
         )
-    elif tid == pylibcudf.TypeId.DECIMAL128:
+    elif tid == plc.TypeId.DECIMAL128:
         return cudf.Decimal128Dtype(
             precision=cudf.Decimal128Dtype.MAX_PRECISION,
             scale=-type_.scale()
         )
     else:
-        return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
-            <underlying_type_t_type_id>(tid)
-        ]
+        return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid]
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
deleted file mode 100644
index 6cc52d046af..00000000000
--- a/python/cudf/cudf/_lib/utils.pxd
+++ /dev/null
@@ -1,22 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-from libcpp.string cimport string
-from libcpp.vector cimport vector
-
-from pylibcudf.libcudf.column.column cimport column_view
-from pylibcudf.libcudf.table.table cimport table, table_view
-
-
-cdef data_from_unique_ptr(
-    unique_ptr[table] c_tbl, column_names, index_names=*)
-cpdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
-cpdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *)
-cdef data_from_table_view(
-    table_view tv, object owner, object column_names, object index_names=*)
-cdef table_view table_view_from_columns(columns) except *
-cdef table_view table_view_from_table(tbl, ignore_index=*) except*
-cdef columns_from_unique_ptr(unique_ptr[table] c_tbl)
-cdef columns_from_table_view(table_view tv, object owners)
-cpdef columns_from_pylibcudf_table(tbl)
-cpdef _data_from_columns(columns, column_names, index_names=*)
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
deleted file mode 100644
index 6b3f10e1806..00000000000
--- a/python/cudf/cudf/_lib/utils.pyx
+++ /dev/null
@@ -1,400 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import numpy as np
-import pyarrow as pa
-
-import cudf
-
-from cython.operator cimport dereference
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
-
-from pylibcudf.libcudf.column.column cimport column, column_view
-from pylibcudf.libcudf.table.table cimport table
-from pylibcudf.libcudf.table.table_view cimport table_view
-from pylibcudf.libcudf.types cimport size_type
-
-from cudf._lib.column cimport Column
-
-try:
-    import ujson as json
-except ImportError:
-    import json
-
-from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype
-
-PARQUET_META_TYPE_MAP = {
-    str(cudf_dtype): str(pandas_dtype)
-    for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items()
-}
-
-cdef table_view table_view_from_columns(columns) except*:
-    """Create a cudf::table_view from an iterable of Columns."""
-    cdef vector[column_view] column_views
-
-    cdef Column col
-    for col in columns:
-        column_views.push_back(col.view())
-
-    return table_view(column_views)
-
-
-cdef table_view table_view_from_table(tbl, ignore_index=False) except*:
-    """Create a cudf::table_view from a Table.
-
-    Parameters
-    ----------
-    ignore_index : bool, default False
-        If True, don't include the index in the columns.
-    """
-    return table_view_from_columns(
-        tbl._index._columns + tbl._columns
-        if not ignore_index and tbl._index is not None
-        else tbl._columns
-    )
-
-
-cpdef generate_pandas_metadata(table, index):
-    col_names = []
-    types = []
-    index_levels = []
-    index_descriptors = []
-    columns_to_convert = list(table._columns)
-    # Columns
-    for name, col in table._column_labels_and_values:
-        if cudf.get_option("mode.pandas_compatible"):
-            # in pandas-compat mode, non-string column names are stringified.
-            col_names.append(str(name))
-        else:
-            col_names.append(name)
-
-        if isinstance(col.dtype, cudf.CategoricalDtype):
-            raise ValueError(
-                "'category' column dtypes are currently not "
-                + "supported by the gpu accelerated parquet writer"
-            )
-        elif isinstance(col.dtype, (
-            cudf.ListDtype,
-            cudf.StructDtype,
-            cudf.core.dtypes.DecimalDtype
-        )):
-            types.append(col.dtype.to_arrow())
-        else:
-            # A boolean element takes 8 bits in cudf and 1 bit in
-            # pyarrow. To make sure the cudf format is interperable
-            # in arrow, we use `int8` type when converting from a
-            # cudf boolean array.
-            if col.dtype.type == np.bool_:
-                types.append(pa.int8())
-            else:
-                types.append(np_to_pa_dtype(col.dtype))
-
-    # Indexes
-    materialize_index = False
-    if index is not False:
-        for level, name in enumerate(table._index.names):
-            if isinstance(table._index, cudf.MultiIndex):
-                idx = table.index.get_level_values(level)
-            else:
-                idx = table.index
-
-            if isinstance(idx, cudf.RangeIndex):
-                if index is None:
-                    descr = {
-                        "kind": "range",
-                        "name": table.index.name,
-                        "start": table.index.start,
-                        "stop": table.index.stop,
-                        "step": table.index.step,
-                    }
-                else:
-                    materialize_index = True
-                    # When `index=True`, RangeIndex needs to be materialized.
-                    materialized_idx = idx._as_int_index()
-                    descr = _index_level_name(
-                        index_name=materialized_idx.name,
-                        level=level,
-                        column_names=col_names
-                    )
-                    index_levels.append(materialized_idx)
-                    columns_to_convert.append(materialized_idx._values)
-                    col_names.append(descr)
-                    types.append(np_to_pa_dtype(materialized_idx.dtype))
-            else:
-                descr = _index_level_name(
-                    index_name=idx.name,
-                    level=level,
-                    column_names=col_names
-                )
-                columns_to_convert.append(idx._values)
-                col_names.append(descr)
-                if isinstance(idx.dtype, cudf.CategoricalDtype):
-                    raise ValueError(
-                        "'category' column dtypes are currently not "
-                        + "supported by the gpu accelerated parquet writer"
-                    )
-                elif isinstance(idx.dtype, cudf.ListDtype):
-                    types.append(col.dtype.to_arrow())
-                else:
-                    # A boolean element takes 8 bits in cudf and 1 bit in
-                    # pyarrow. To make sure the cudf format is interperable
-                    # in arrow, we use `int8` type when converting from a
-                    # cudf boolean array.
-                    if idx.dtype.type == np.bool_:
-                        types.append(pa.int8())
-                    else:
-                        types.append(np_to_pa_dtype(idx.dtype))
-
-                index_levels.append(idx)
-            index_descriptors.append(descr)
-
-    df_meta = table.head(0)
-    if materialize_index:
-        df_meta.index = df_meta.index._as_int_index()
-    metadata = pa.pandas_compat.construct_metadata(
-        columns_to_convert=columns_to_convert,
-        # It is OKAY to do `.head(0).to_pandas()` because
-        # this method will extract `.columns` metadata only
-        df=df_meta.to_pandas(),
-        column_names=col_names,
-        index_levels=index_levels,
-        index_descriptors=index_descriptors,
-        preserve_index=index,
-        types=types,
-    )
-
-    md_dict = json.loads(metadata[b"pandas"])
-
-    # correct metadata for list and struct and nullable numeric types
-    for col_meta in md_dict["columns"]:
-        if (
-            col_meta["name"] in table._column_names
-            and table._data[col_meta["name"]].nullable
-            and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP
-            and col_meta["pandas_type"] != "decimal"
-        ):
-            col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[
-                col_meta["numpy_type"]
-            ]
-        if col_meta["numpy_type"] in ("list", "struct"):
-            col_meta["numpy_type"] = "object"
-
-    return json.dumps(md_dict)
-
-
-def _index_level_name(index_name, level, column_names):
-    """
-    Return the name of an index level or a default name
-    if `index_name` is None or is already a column name.
-
-    Parameters
-    ----------
-    index_name : name of an Index object
-    level : level of the Index object
-
-    Returns
-    -------
-    name : str
-    """
-    if index_name is not None and index_name not in column_names:
-        return index_name
-    else:
-        return f"__index_level_{level}__"
-
-
-cdef columns_from_unique_ptr(
-    unique_ptr[table] c_tbl
-):
-    """Convert a libcudf table into list of columns.
-
-    Parameters
-    ----------
-    c_tbl : unique_ptr[cudf::table]
-        The libcudf table whose columns will be extracted
-
-    Returns
-    -------
-    list[Column]
-        A list of columns.
-    """
-    cdef vector[unique_ptr[column]] c_columns = move(c_tbl.get().release())
-    cdef vector[unique_ptr[column]].iterator it = c_columns.begin()
-
-    cdef size_t i
-
-    columns = [Column.from_unique_ptr(move(dereference(it+i)))
-               for i in range(c_columns.size())]
-
-    return columns
-
-
-cpdef columns_from_pylibcudf_table(tbl):
-    """Convert a pylibcudf table into list of columns.
-
-    Parameters
-    ----------
-    tbl : pylibcudf.Table
-        The pylibcudf table whose columns will be extracted
-
-    Returns
-    -------
-    list[Column]
-        A list of columns.
-    """
-    return [Column.from_pylibcudf(plc) for plc in tbl.columns()]
-
-
-cpdef _data_from_columns(columns, column_names, index_names=None):
-    """Convert a list of columns into a dict with an index.
-
-    This method is intended to provide the bridge between the columns returned
-    from calls to libcudf or pylibcudf APIs and the cuDF Python Frame objects, which
-    require named columns and a separate index.
-
-    Since cuDF Python has an independent representation of a table as a
-    collection of columns, this function simply returns a dict of columns
-    suitable for conversion into data to be passed to cuDF constructors.
-    This method returns the columns of the table in the order they are
-    stored in libcudf, but calling code is responsible for partitioning and
-    labeling them as needed.
-
-    Parameters
-    ----------
-    columns : list[Column]
-        The columns to be extracted
-    column_names : iterable
-        The keys associated with the columns in the output data.
-    index_names : iterable, optional
-        If provided, an iterable of strings that will be used to label the
-        corresponding first set of columns into a (Multi)Index. If this
-        argument is omitted, all columns are assumed to be part of the output
-        table and no index is constructed.
-    """
-    # First construct the index, if any
-    index = (
-        # TODO: For performance, the _from_data methods of Frame types assume
-        # that the passed index object is already an Index because cudf.Index
-        # and cudf.as_index are expensive. As a result, this function is
-        # currently somewhat inconsistent in returning a dict of columns for
-        # the data while actually constructing the Index object here (instead
-        # of just returning a dict for that as well). As we clean up the
-        # Frame factories we may want to look for a less dissonant approach
-        # that does not impose performance penalties. The same applies to
-        # data_from_table_view below.
-        cudf.core.index._index_from_data(
-            {
-                name: columns[i]
-                for i, name in enumerate(index_names)
-            }
-        )
-        if index_names is not None
-        else None
-    )
-    n_index_columns = len(index_names) if index_names is not None else 0
-    data = {
-        name: columns[i + n_index_columns]
-        for i, name in enumerate(column_names)
-    }
-    return data, index
-
-
-cdef data_from_unique_ptr(
-    unique_ptr[table] c_tbl, column_names, index_names=None
-):
-    return _data_from_columns(
-        columns_from_unique_ptr(move(c_tbl)),
-        column_names,
-        index_names
-    )
-
-
-cpdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
-    return _data_from_columns(
-        columns_from_pylibcudf_table(tbl),
-        column_names,
-        index_names
-    )
-
-cpdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None):
-    """
-    Unpacks the TableWithMetadata from libcudf I/O
-    into a dict of columns and an Index (cuDF format)
-    """
-    if column_names is None:
-        column_names = tbl_with_meta.column_names(include_children=False)
-    return _data_from_columns(
-        columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns],
-        column_names=column_names,
-        index_names=index_names
-    )
-
-cdef columns_from_table_view(
-    table_view tv,
-    object owners,
-):
-    """
-    Given a ``cudf::table_view``, constructs a list of columns from it,
-    along with referencing an owner Python object that owns the memory
-    lifetime. owner must be either None or a list of column. If owner
-    is a list of columns, the owner of the `i`th ``cudf::column_view``
-    in the table view is ``owners[i]``. For more about memory ownership,
-    see ``Column.from_column_view``.
-    """
-
-    return [
-        Column.from_column_view(
-            tv.column(i), owners[i] if isinstance(owners, list) else None
-        ) for i in range(tv.num_columns())
-    ]
-
-cdef data_from_table_view(
-    table_view tv,
-    object owner,
-    object column_names,
-    object index_names=None
-):
-    """
-    Given a ``cudf::table_view``, constructs a Frame from it,
-    along with referencing an ``owner`` Python object that owns the memory
-    lifetime. If ``owner`` is a Frame we reach inside of it and
-    reach inside of each ``cudf.Column`` to make the owner of each newly
-    created ``Buffer`` underneath the ``cudf.Column`` objects of the
-    created Frame the respective ``Buffer`` from the relevant
-    ``cudf.Column`` of the ``owner`` Frame
-    """
-    cdef size_type column_idx = 0
-    table_owner = isinstance(owner, cudf.core.frame.Frame)
-
-    # First construct the index, if any
-    index = None
-    if index_names is not None:
-        index_columns = []
-        for _ in index_names:
-            column_owner = owner
-            if table_owner:
-                column_owner = owner._index._columns[column_idx]
-            index_columns.append(
-                Column.from_column_view(
-                    tv.column(column_idx),
-                    column_owner
-                )
-            )
-            column_idx += 1
-        index = cudf.core.index._index_from_data(
-            dict(zip(index_names, index_columns)))
-
-    # Construct the data dict
-    cdef size_type source_column_idx = 0
-    data_columns = []
-    for _ in column_names:
-        column_owner = owner
-        if table_owner:
-            column_owner = owner._columns[source_column_idx]
-        data_columns.append(
-            Column.from_column_view(tv.column(column_idx), column_owner)
-        )
-        column_idx += 1
-        source_column_idx += 1
-
-    return dict(zip(column_names, data_columns)), index
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 9c436dfad18..cad4b1aa72c 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 
 """Define common type operations."""
 
@@ -13,6 +13,7 @@
 import cupy as cp
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from pandas.api import types as pd_types
 
 import cudf
@@ -144,6 +145,7 @@ def is_scalar(val):
             cudf.Scalar,
             cudf._lib.scalar.DeviceScalar,
             cudf.core.tools.datetimes.DateOffset,
+            pa.Scalar,
         ),
     ) or (
         pd_types.is_scalar(val)
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 2df154ee112..2806a1f6c23 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -1,8 +1,7 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
-import pickle
 import warnings
 from functools import cached_property
 from typing import TYPE_CHECKING, Any, Literal
@@ -11,17 +10,18 @@
 from typing_extensions import Self
 
 import cudf
-from cudf._lib.copying import _gather_map_is_valid, gather
-from cudf._lib.stream_compaction import (
+from cudf._lib.types import size_type_dtype
+from cudf.api.extensions import no_default
+from cudf.api.types import is_integer, is_list_like, is_scalar
+from cudf.core._internals import copying
+from cudf.core._internals.stream_compaction import (
     apply_boolean_mask,
     drop_duplicates,
     drop_nulls,
 )
-from cudf._lib.types import size_type_dtype
-from cudf.api.extensions import no_default
-from cudf.api.types import is_integer, is_list_like, is_scalar
 from cudf.core.abc import Serializable
 from cudf.core.column import ColumnBase, column
+from cudf.core.copy_types import GatherMap
 from cudf.errors import MixedTypeError
 from cudf.utils import ioutils
 from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype
@@ -330,13 +330,6 @@ def get_level_values(self, level):
         else:
             raise KeyError(f"Requested level with name {level} " "not found")
 
-    @classmethod
-    def deserialize(cls, header, frames):
-        # Dispatch deserialization to the appropriate index type in case
-        # deserialization is ever attempted with the base class directly.
-        idx_type = pickle.loads(header["type-serialized"])
-        return idx_type.deserialize(header, frames)
-
     @property
     def names(self):
         """
@@ -357,7 +350,7 @@ def names(self, values):
 
         self.name = values[0]
 
-    def _clean_nulls_from_index(self):
+    def _pandas_repr_compatible(self):
         """
         Convert all na values(if any) in Index object
         to `<NA>` as a preprocessing step to `__repr__` methods.
@@ -421,7 +414,7 @@ def hasnans(self):
         raise NotImplementedError
 
     @property
-    def nlevels(self):
+    def nlevels(self) -> int:
         """
         Number of levels.
         """
@@ -1454,7 +1447,7 @@ def _union(self, other, sort=None):
         other_df["order"] = other_df.index
         res = self_df.merge(other_df, on=[0], how="outer")
         res = res.sort_values(
-            by=res._data.to_pandas_index()[1:], ignore_index=True
+            by=res._data.to_pandas_index[1:], ignore_index=True
         )
         union_result = cudf.core.index._index_from_data({0: res._data[0]})
 
@@ -1951,7 +1944,6 @@ def drop_duplicates(
         return self._from_columns_like_self(
             drop_duplicates(
                 list(self._columns),
-                keys=range(len(self._columns)),
                 keep=keep,
                 nulls_are_equal=nulls_are_equal,
             ),
@@ -2040,7 +2032,6 @@ def dropna(self, how="any"):
             drop_nulls(
                 data_columns,
                 how=how,
-                keys=range(len(data_columns)),
             ),
             self._column_names,
         )
@@ -2058,13 +2049,9 @@ def _gather(self, gather_map, nullify=False, check_bounds=True):
         if gather_map.dtype.kind not in "iu":
             gather_map = gather_map.astype(size_type_dtype)
 
-        if not _gather_map_is_valid(
-            gather_map, len(self), check_bounds, nullify
-        ):
-            raise IndexError("Gather map index is out of bounds.")
-
+        GatherMap(gather_map, len(self), nullify=not check_bounds or nullify)
         return self._from_columns_like_self(
-            gather(list(self._columns), gather_map, nullify=nullify),
+            copying.gather(self._columns, gather_map, nullify=nullify),
             self._column_names,
         )
 
diff --git a/python/cudf/cudf/core/_internals/aggregation.py b/python/cudf/cudf/core/_internals/aggregation.py
index fe8ea5a947a..1d21d34b1bf 100644
--- a/python/cudf/cudf/core/_internals/aggregation.py
+++ b/python/cudf/cudf/core/_internals/aggregation.py
@@ -29,11 +29,11 @@
 
 class Aggregation:
     def __init__(self, agg: plc.aggregation.Aggregation) -> None:
-        self.c_obj = agg
+        self.plc_obj = agg
 
     @property
     def kind(self) -> str:
-        name = self.c_obj.kind().name
+        name = self.plc_obj.kind().name
         return _agg_name_map.get(name, name)
 
     @classmethod
diff --git a/python/cudf/cudf/core/_internals/copying.py b/python/cudf/cudf/core/_internals/copying.py
new file mode 100644
index 00000000000..34c1850cb72
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/copying.py
@@ -0,0 +1,96 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import pylibcudf as plc
+
+import cudf
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from cudf.core.column import ColumnBase
+    from cudf.core.column.numerical import NumericalColumn
+
+
+@acquire_spill_lock()
+def gather(  # gather from ``columns`` using ``gather_map``
+    columns: Iterable[ColumnBase],
+    gather_map: NumericalColumn,
+    nullify: bool = False,  # selects the out-of-bounds policy below
+) -> list[ColumnBase]:
+    plc_tbl = plc.copying.gather(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        gather_map.to_pylibcudf(mode="read"),
+        plc.copying.OutOfBoundsPolicy.NULLIFY
+        if nullify
+        else plc.copying.OutOfBoundsPolicy.DONT_CHECK,  # no check requested
+    )
+    return [  # wrap each pylibcudf result column as a cudf Column
+        cudf._lib.column.Column.from_pylibcudf(col)
+        for col in plc_tbl.columns()
+    ]
+
+
+@acquire_spill_lock()
+def scatter(
+    sources: list[ColumnBase | cudf.Scalar],
+    scatter_map: NumericalColumn,
+    target_columns: list[ColumnBase],
+    bounds_check: bool = True,
+):
+    """
+    Scatter ``sources`` into ``target_columns`` as per ``scatter_map``.
+    ``sources`` holds either columns or scalars (``sources[0]`` decides
+    which) and must have the same length as ``target_columns``.
+    """
+    # TODO: Only single column scatter is used, we should explore multi-column
+    # scatter for frames for performance increase.
+
+    if len(sources) != len(target_columns):
+        raise ValueError("Mismatched number of source and target columns.")
+
+    if len(sources) == 0:
+        return []
+
+    if bounds_check:
+        n_rows = len(target_columns[0])  # valid map values: [-n_rows, n_rows)
+        if not (
+            (scatter_map >= -n_rows).all() and (scatter_map < n_rows).all()
+        ):
+            raise IndexError(
+                f"index out of bounds for column of size {n_rows}"
+            )
+
+    plc_tbl = plc.copying.scatter(
+        plc.Table([col.to_pylibcudf(mode="read") for col in sources])  # type: ignore[union-attr]
+        if isinstance(sources[0], cudf._lib.column.Column)
+        else [slr.device_value.c_value for slr in sources],  # type: ignore[union-attr]
+        scatter_map.to_pylibcudf(mode="read"),
+        plc.Table([col.to_pylibcudf(mode="read") for col in target_columns]),
+    )
+
+    return [
+        cudf._lib.column.Column.from_pylibcudf(col)
+        for col in plc_tbl.columns()
+    ]
+
+
+@acquire_spill_lock()
+def columns_split(
+    input_columns: Iterable[ColumnBase], splits: list[int]
+) -> list[list[ColumnBase]]:
+    return [  # outer list: one entry per table produced by split
+        [  # inner list: that table's columns wrapped as cudf Columns
+            cudf._lib.column.Column.from_pylibcudf(col)
+            for col in plc_tbl.columns()
+        ]
+        for plc_tbl in plc.copying.split(
+            plc.Table(
+                [col.to_pylibcudf(mode="read") for col in input_columns]
+            ),
+            splits,  # forwarded unchanged to plc.copying.split
+        )
+    ]
diff --git a/python/cudf/cudf/core/_internals/sorting.py b/python/cudf/cudf/core/_internals/sorting.py
new file mode 100644
index 00000000000..69f9e7664b1
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/sorting.py
@@ -0,0 +1,205 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+import itertools
+from typing import TYPE_CHECKING, Literal
+
+import pylibcudf as plc
+
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    from cudf.core.column import ColumnBase
+
+
+@acquire_spill_lock()
+def is_sorted(
+    source_columns: list[ColumnBase],
+    ascending: list[bool] | None = None,
+    null_position: list[bool] | None = None,
+) -> bool:
+    """
+    Checks whether the rows of `source_columns` are sorted lexicographically.
+
+    Parameters
+    ----------
+    source_columns : list of columns
+        columns to be checked for sort order
+    ascending : None or list-like of booleans
+        None or list-like of boolean values indicating expected sort order of
+        each column. If list-like, its size must be len(source_columns). If
+        None, all columns expected sort order is set to ascending. False (0) -
+        descending, True (1) - ascending.
+    null_position : None or list-like of booleans
+        None or list-like of boolean values indicating desired order of nulls
+        compared to other elements. If list-like, size of list-like must be
+        len(source_columns). If None, null order is set to after. False (0) -
+        after, True (1) - before.
+
+    Returns
+    -------
+    returns : boolean
+        Returns True, if sorted as expected by ``ascending`` and
+        ``null_position``, False otherwise.
+    """
+    if ascending is None:
+        column_order = [plc.types.Order.ASCENDING] * len(source_columns)
+    else:
+        if len(ascending) != len(source_columns):
+            raise ValueError(
+                f"Expected a list-like of length {len(source_columns)}, "
+                f"got length {len(ascending)} for `ascending`"
+            )
+        column_order = [
+            plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING
+            for asc in ascending
+        ]
+
+    if null_position is None:
+        null_precedence = [plc.types.NullOrder.AFTER] * len(source_columns)
+    else:
+        if len(null_position) != len(source_columns):
+            raise ValueError(
+                f"Expected a list-like of length {len(source_columns)}, "
+                f"got length {len(null_position)} for `null_position`"
+            )
+        null_precedence = [
+            plc.types.NullOrder.BEFORE if null else plc.types.NullOrder.AFTER
+            for null in null_position
+        ]
+
+    return plc.sorting.is_sorted(
+        plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]),
+        column_order,
+        null_precedence,
+    )
+
+
+def ordering(
+    column_order: list[bool],
+    null_precedence: Iterable[Literal["first", "last"]],
+) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]:
+    """
+    Construct order and null order vectors
+
+    Parameters
+    ----------
+    column_order
+        Iterable of bool (True for ascending order, False for descending)
+    null_precedence
+        Iterable string for null positions ("first" for start, "last" for end)
+
+    Lengths are not checked; any other null position raises ValueError.
+
+    Returns
+    -------
+    pair of vectors (order, and null_order)
+    """
+    c_column_order = []
+    c_null_precedence = []
+    for asc, null in zip(column_order, null_precedence):
+        if null not in ("first", "last"):
+            raise ValueError(f"Invalid null precedence {null}")
+        c_column_order.append(
+            plc.types.Order.ASCENDING if asc else plc.types.Order.DESCENDING
+        )
+        if bool(asc) == (null == "first"):  # asc+"first" or desc+"last"
+            c_null_precedence.append(plc.types.NullOrder.BEFORE)
+        else:
+            c_null_precedence.append(plc.types.NullOrder.AFTER)
+    return c_column_order, c_null_precedence
+
+
+@acquire_spill_lock()
+def order_by(
+    columns_from_table: list[ColumnBase],
+    ascending: list[bool],
+    na_position: Literal["first", "last"],
+    *,
+    stable: bool,
+):
+    """
+    Compute the row order that sorts the given columns.
+
+    Parameters
+    ----------
+    columns_from_table : list[Column]
+        Key columns whose rows determine the sort order
+    ascending : sequence[bool]
+        One flag per key column: True sorts that column ascending,
+        False sorts it descending
+    na_position : str
+        Either "first" or "last"; the same null placement is applied
+        to **every** key column
+    stable : bool
+        If True use the stable sorted-order routine, otherwise the
+        regular one (keyword-only, no default)
+
+    Returns
+    -------
+    Column of indices that sorts the table
+    """
+    column_order, null_order = ordering(
+        ascending, itertools.repeat(na_position)
+    )
+    if stable:
+        compute_order = plc.sorting.stable_sorted_order
+    else:
+        compute_order = plc.sorting.sorted_order
+    key_table = plc.Table(
+        [column.to_pylibcudf(mode="read") for column in columns_from_table]
+    )
+    return Column.from_pylibcudf(
+        compute_order(key_table, column_order, null_order)
+    )
+
+
+@acquire_spill_lock()
+def sort_by_key(
+    values: list[ColumnBase],
+    keys: list[ColumnBase],
+    ascending: list[bool],
+    na_position: list[Literal["first", "last"]],
+    *,
+    stable: bool,
+) -> list[ColumnBase]:
+    """
+    Reorder the value columns according to the sort order of the keys.
+
+    Parameters
+    ----------
+    values : list[Column]
+        Columns of the table which will be reordered
+    keys : list[Column]
+        Columns making up the sort key
+    ascending : list[bool]
+        One flag per key column giving its sort direction:
+        True - Ascending and False - Descending
+    na_position : list[str]
+        One "first" or "last" entry per key column indicating
+        the position of null values when sorting the keys
+    stable : bool
+        If True use the stable sort-by-key routine, otherwise the
+        regular one (keyword-only, no default)
+
+    Returns
+    -------
+    list[Column]
+        list of value columns sorted by keys
+    """
+    column_order, null_order = ordering(ascending, na_position)
+    if stable:
+        sorter = plc.sorting.stable_sort_by_key
+    else:
+        sorter = plc.sorting.sort_by_key
+    sorted_table = sorter(
+        plc.Table([col.to_pylibcudf(mode="read") for col in values]),
+        plc.Table([col.to_pylibcudf(mode="read") for col in keys]),
+        column_order,
+        null_order,
+    )
+    # Wrap each resulting pylibcudf column back into a cudf Column.
+    return [Column.from_pylibcudf(col) for col in sorted_table.columns()]
diff --git a/python/cudf/cudf/core/_internals/stream_compaction.py b/python/cudf/cudf/core/_internals/stream_compaction.py
new file mode 100644
index 00000000000..4ccc26c2a1c
--- /dev/null
+++ b/python/cudf/cudf/core/_internals/stream_compaction.py
@@ -0,0 +1,121 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
+import pylibcudf as plc
+
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
+
+if TYPE_CHECKING:
+    from cudf.core.column import ColumnBase
+
+
+@acquire_spill_lock()
+def drop_nulls(
+    columns: list[ColumnBase],
+    how: Literal["any", "all"] = "any",
+    keys: list[int] | None = None,
+    thresh: int | None = None,
+) -> list[ColumnBase]:
+    """
+    Remove rows containing nulls, judged on a subset of key columns.
+
+    Parameters
+    ----------
+    columns : list of columns
+    how  : "any" or "all". Only used when thresh is None: "any" sets the
+           keep threshold to len(keys), "all" sets it to 1 (default: "any")
+    keys : List of column indices to inspect for nulls. Defaults to all
+           of columns (optional)
+    thresh : Minimum number of non-nulls required to keep a row (optional)
+
+    Returns
+    -------
+    columns with null rows dropped
+    """
+    if how not in {"any", "all"}:
+        raise ValueError("how must be 'any' or 'all'")
+
+    if keys is None:
+        keys = list(range(len(columns)))
+
+    # An explicit thresh takes priority over how.
+    if thresh is not None:
+        keep_threshold = thresh
+    else:
+        keep_threshold = 1 if how == "all" else len(keys)
+
+    source_table = plc.Table(
+        [column.to_pylibcudf(mode="read") for column in columns]
+    )
+    result = plc.stream_compaction.drop_nulls(
+        source_table, keys, keep_threshold
+    )
+    return [Column.from_pylibcudf(column) for column in result.columns()]
+
+
+@acquire_spill_lock()
+def apply_boolean_mask(
+    columns: list[ColumnBase], boolean_mask: ColumnBase
+) -> list[ColumnBase]:
+    """
+    Filter rows, dropping those whose boolean_mask entry is False.
+
+    Parameters
+    ----------
+    columns : list of columns to filter row-wise
+    boolean_mask : a boolean column with one entry per row of columns
+
+    Returns
+    -------
+    columns obtained from applying mask
+    """
+    source = plc.Table([col.to_pylibcudf(mode="read") for col in columns])
+    mask = boolean_mask.to_pylibcudf(mode="read")
+    # Rows whose mask entry is False are removed by the pylibcudf call.
+    filtered = plc.stream_compaction.apply_boolean_mask(source, mask)
+    return [Column.from_pylibcudf(col) for col in filtered.columns()]
+
+
+@acquire_spill_lock()
+def drop_duplicates(
+    columns: list[ColumnBase],
+    keys: list[int] | None = None,
+    keep: Literal["first", "last", False] = "first",
+    nulls_are_equal: bool = True,
+) -> list[ColumnBase]:
+    """
+    Remove duplicate rows, where duplicates are determined by the key columns.
+
+    Parameters
+    ----------
+    columns : List of columns
+    keys : List of column indices used to detect duplicates. When omitted,
+           every column takes part in the comparison (optional)
+    keep : "first" or "last" keeps that occurrence; False keeps none
+    nulls_are_equal : if True, nulls compare equal to each other, else not.
+
+    Returns
+    -------
+    columns with duplicate dropped
+    """
+    keep_option = {
+        "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
+        "last": plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
+        False: plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
+    }.get(keep)
+    if keep_option is None:
+        raise ValueError('keep must be either "first", "last" or False')
+    if keys is None:
+        keys = list(range(len(columns)))
+    null_eq = plc.types.NullEquality
+    distinct = plc.stream_compaction.stable_distinct(
+        plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+        keys,
+        keep_option,
+        null_eq.EQUAL if nulls_are_equal else null_eq.UNEQUAL,
+        plc.types.NanEquality.ALL_EQUAL,
+    )
+    return [Column.from_pylibcudf(col) for col in distinct.columns()]
diff --git a/python/cudf/cudf/core/abc.py b/python/cudf/cudf/core/abc.py
index ce6bb83bc77..c8ea03b04fe 100644
--- a/python/cudf/cudf/core/abc.py
+++ b/python/cudf/cudf/core/abc.py
@@ -1,8 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 """Common abstract base classes for cudf."""
 
-import pickle
-
 import numpy
 
 import cudf
@@ -22,6 +20,14 @@ class Serializable:
     latter converts back from that representation into an equivalent object.
     """
 
+    # A mapping from class names to the classes themselves. This is used to
+    # reconstruct the correct class when deserializing an object.
+    _name_type_map: dict = {}
+
+    def __init_subclass__(cls, /, **kwargs):
+        super().__init_subclass__(**kwargs)
+        cls._name_type_map[cls.__name__] = cls
+
     def serialize(self):
         """Generate an equivalent serializable representation of an object.
 
@@ -98,7 +104,7 @@ def device_serialize(self):
             )
             for f in frames
         )
-        header["type-serialized"] = pickle.dumps(type(self))
+        header["type-serialized-name"] = type(self).__name__
         header["is-cuda"] = [
             hasattr(f, "__cuda_array_interface__") for f in frames
         ]
@@ -128,10 +134,10 @@ def device_deserialize(cls, header, frames):
 
         :meta private:
         """
-        typ = pickle.loads(header["type-serialized"])
+        typ = cls._name_type_map[header["type-serialized-name"]]
         frames = [
             cudf.core.buffer.as_buffer(f) if c else memoryview(f)
-            for c, f in zip(header["is-cuda"], frames)
+            for c, f in zip(header["is-cuda"], frames, strict=True)
         ]
         return typ.deserialize(header, frames)
 
diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index ffa306bf93f..625938ca168 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import math
-import pickle
 import weakref
 from types import SimpleNamespace
 from typing import TYPE_CHECKING, Any, Literal
@@ -432,8 +431,7 @@ def serialize(self) -> tuple[dict, list]:
             second element is a list containing single frame.
         """
         header: dict[str, Any] = {}
-        header["type-serialized"] = pickle.dumps(type(self))
-        header["owner-type-serialized"] = pickle.dumps(type(self._owner))
+        header["owner-type-serialized-name"] = type(self._owner).__name__
         header["frame_count"] = 1
         frames = [self]
         return header, frames
@@ -460,7 +458,9 @@ def deserialize(cls, header: dict, frames: list) -> Self:
         if isinstance(frame, cls):
             return frame  # The frame is already deserialized
 
-        owner_type: BufferOwner = pickle.loads(header["owner-type-serialized"])
+        owner_type: BufferOwner = Serializable._name_type_map[
+            header["owner-type-serialized-name"]
+        ]
         if hasattr(frame, "__cuda_array_interface__"):
             owner = owner_type.from_device_memory(frame, exposed=False)
         else:
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index 7305ff651c6..cbb65229933 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import collections.abc
-import pickle
 import time
 import weakref
 from threading import RLock
@@ -415,8 +414,7 @@ def serialize(self) -> tuple[dict, list]:
         header: dict[str, Any] = {}
         frames: list[Buffer | memoryview]
         with self._owner.lock:
-            header["type-serialized"] = pickle.dumps(self.__class__)
-            header["owner-type-serialized"] = pickle.dumps(type(self._owner))
+            header["owner-type-serialized-name"] = type(self._owner).__name__
             header["frame_count"] = 1
             if self.is_spilled:
                 frames = [self.memoryview()]
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
index 8d38a5f2272..0fe47255368 100644
--- a/python/cudf/cudf/core/byte_pair_encoding.py
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -1,13 +1,10 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
 import pylibcudf as plc
 
 import cudf
-from cudf._lib.nvtext.byte_pair_encode import (
-    byte_pair_encoding as cpp_byte_pair_encoding,
-)
 
 
 class BytePairEncoder:
@@ -25,12 +22,12 @@ class BytePairEncoder:
     BytePairEncoder
     """
 
-    def __init__(self, merges_pair: "cudf.Series"):
+    def __init__(self, merges_pair: cudf.Series) -> None:
         self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs(
             merges_pair._column.to_pylibcudf(mode="read")
         )
 
-    def __call__(self, text, separator: str = " ") -> cudf.Series:
+    def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series:
         """
 
         Parameters
@@ -56,7 +53,6 @@ def __call__(self, text, separator: str = " ") -> cudf.Series:
         1             this is it
         dtype: object
         """
-        sep = cudf.Scalar(separator, dtype="str")
-        result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)
-
-        return cudf.Series._from_column(result)
+        return cudf.Series._from_column(
+            text._column.byte_pair_encoding(self.merge_pairs, separator)
+        )
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index c849a9d3d2b..d705b4d4c21 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -13,7 +13,6 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.transform import bools_to_mask
 from cudf.core._internals import unary
 from cudf.core.column import column
 from cudf.core.column.methods import ColumnMethods
@@ -622,7 +621,7 @@ def ordered(self) -> bool:
     def __setitem__(self, key, value):
         if cudf.api.types.is_scalar(
             value
-        ) and cudf._lib.scalar._is_null_host_scalar(value):
+        ) and cudf.utils.utils._is_null_host_scalar(value):
             to_add_categories = 0
         else:
             if cudf.api.types.is_scalar(value):
@@ -775,12 +774,11 @@ def to_pandas(
             raise NotImplementedError(f"{arrow_type=} is not implemented.")
 
         if self.categories.dtype.kind == "f":
-            new_mask = bools_to_mask(self.notnull())
             col = type(self)(
                 data=self.data,  # type: ignore[arg-type]
                 size=self.size,
                 dtype=self.dtype,
-                mask=new_mask,
+                mask=self.notnull().fillna(False).as_mask(),
                 children=self.children,
             )
         else:
@@ -1097,17 +1095,22 @@ def as_categorical_column(self, dtype: Dtype) -> Self:
             raise ValueError("dtype must be CategoricalDtype")
 
         if not isinstance(self.categories, type(dtype.categories._column)):
-            # If both categories are of different Column types,
-            # return a column full of Nulls.
-            codes = cast(
-                cudf.core.column.numerical.NumericalColumn,
-                column.as_column(
-                    _DEFAULT_CATEGORICAL_VALUE,
-                    length=self.size,
-                    dtype=self.codes.dtype,
-                ),
-            )
-            codes = as_unsigned_codes(len(dtype.categories), codes)
+            if isinstance(
+                self.categories.dtype, cudf.StructDtype
+            ) and isinstance(dtype.categories.dtype, cudf.IntervalDtype):
+                codes = self.codes
+            else:
+                # Otherwise if both categories are of different Column types,
+                # return a column full of nulls.
+                codes = cast(
+                    cudf.core.column.numerical.NumericalColumn,
+                    column.as_column(
+                        _DEFAULT_CATEGORICAL_VALUE,
+                        length=self.size,
+                        dtype=self.codes.dtype,
+                    ),
+                )
+                codes = as_unsigned_codes(len(dtype.categories), codes)
             return type(self)(
                 data=self.data,  # type: ignore[arg-type]
                 size=self.size,
@@ -1189,13 +1192,13 @@ def _concat(
         codes = [o.codes for o in objs]
 
         newsize = sum(map(len, codes))
-        if newsize > libcudf.MAX_COLUMN_SIZE:
+        if newsize > np.iinfo(libcudf.types.size_type_dtype).max:
             raise MemoryError(
                 f"Result of concat cannot have "
-                f"size > {libcudf.MAX_COLUMN_SIZE_STR}"
+                f"size > {libcudf.types.size_type_dtype}_MAX"
             )
         elif newsize == 0:
-            codes_col = column.column_empty(0, head.codes.dtype, masked=True)
+            codes_col = column.column_empty(0, head.codes.dtype)
         else:
             codes_col = column.concat_columns(codes)  # type: ignore[arg-type]
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 1ddc79e8970..e23ca810065 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1,8 +1,8 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
-import pickle
+import warnings
 from collections import abc
 from collections.abc import MutableSequence, Sequence
 from functools import cached_property
@@ -25,15 +25,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.column import Column
-from cudf._lib.scalar import as_device_scalar
-from cudf._lib.stream_compaction import (
-    apply_boolean_mask,
-    distinct_count as cpp_distinct_count,
-    drop_duplicates,
-    drop_nulls,
-)
-from cudf._lib.transform import bools_to_mask
-from cudf._lib.types import size_type_dtype
+from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
     _is_pandas_nullable_extension_dtype,
@@ -43,7 +35,12 @@
     is_string_dtype,
 )
 from cudf.core._compat import PANDAS_GE_210
-from cudf.core._internals import unary
+from cudf.core._internals import aggregation, copying, sorting, unary
+from cudf.core._internals.stream_compaction import (
+    apply_boolean_mask,
+    drop_duplicates,
+    drop_nulls,
+)
 from cudf.core._internals.timezones import get_compatible_timezone
 from cudf.core.abc import Serializable
 from cudf.core.buffer import (
@@ -52,6 +49,7 @@
     as_buffer,
     cuda_array_interface_wrapper,
 )
+from cudf.core.copy_types import GatherMap
 from cudf.core.dtypes import (
     CategoricalDtype,
     DecimalDtype,
@@ -72,12 +70,14 @@
     min_signed_type,
     min_unsigned_type,
 )
-from cudf.utils.utils import _array_ufunc, mask_dtype
+from cudf.utils.utils import _array_ufunc, _is_null_host_scalar, mask_dtype
 
 if TYPE_CHECKING:
     import builtins
 
     from cudf._typing import ColumnLike, Dtype, ScalarLike
+    from cudf.core.column.numerical import NumericalColumn
+    from cudf.core.column.strings import StringColumn
 
 if PANDAS_GE_210:
     NumpyExtensionArray = pd.arrays.NumpyExtensionArray
@@ -93,6 +93,8 @@ class ColumnBase(Column, Serializable, BinaryOperand, Reducible):
         "min",
     }
 
+    _PANDAS_NA_REPR = str(pd.NA)
+
     def data_array_view(
         self, *, mode: Literal["write", "read"] = "write"
     ) -> "cuda.devicearray.DeviceNDArray":
@@ -177,6 +179,17 @@ def __repr__(self):
             f"dtype: {self.dtype}"
         )
 
+    def _prep_pandas_compat_repr(self) -> StringColumn | Self:
+        """
+        Preprocess Column to be compatible with pandas repr, namely handling nulls.
+
+        * null (datetime/timedelta) = str(pd.NaT)
+        * null (other types)= str(pd.NA)
+        """
+        if self.has_nulls():
+            return self.astype("str").fillna(self._PANDAS_NA_REPR)
+        return self
+
     def to_pandas(
         self,
         *,
@@ -240,8 +253,12 @@ def find_and_replace(
     def clip(self, lo: ScalarLike, hi: ScalarLike) -> Self:
         plc_column = plc.replace.clamp(
             self.to_pylibcudf(mode="read"),
-            cudf.Scalar(lo, self.dtype).device_value.c_value,
-            cudf.Scalar(hi, self.dtype).device_value.c_value,
+            plc.interop.from_arrow(
+                pa.scalar(lo, type=cudf_dtype_to_pa_type(self.dtype))
+            ),
+            plc.interop.from_arrow(
+                pa.scalar(hi, type=cudf_dtype_to_pa_type(self.dtype))
+            ),
         )
         return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
 
@@ -261,28 +278,25 @@ def all(self, skipna: bool = True) -> bool:
         # The skipna argument is only used for numerical columns.
         # If all entries are null the result is True, including when the column
         # is empty.
-
         if self.null_count == self.size:
             return True
-
-        return libcudf.reduce.reduce("all", self)
+        return self.reduce("all")
 
     def any(self, skipna: bool = True) -> bool:
         # Early exit for fast cases.
-
         if not skipna and self.has_nulls():
             return True
         elif skipna and self.null_count == self.size:
             return False
-
-        return libcudf.reduce.reduce("any", self)
+        return self.reduce("any")
 
     def dropna(self) -> Self:
         if self.has_nulls():
-            return drop_nulls([self])[0]._with_type_metadata(self.dtype)
+            return drop_nulls([self])[0]._with_type_metadata(self.dtype)  # type: ignore[return-value]
         else:
             return self.copy()
 
+    @acquire_spill_lock()
     def to_arrow(self) -> pa.Array:
         """Convert to PyArrow Array
 
@@ -299,9 +313,7 @@ def to_arrow(self) -> pa.Array:
           4
         ]
         """
-        return libcudf.interop.to_arrow([self], [("None", self.dtype)])[
-            "None"
-        ].chunk(0)
+        return plc.interop.to_arrow(self.to_pylibcudf(mode="read")).chunk(0)
 
     @classmethod
     def from_arrow(cls, array: pa.Array) -> ColumnBase:
@@ -338,26 +350,33 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
 
         if isinstance(array.type, pa.DictionaryType):
             indices_table = pa.table(
-                {
-                    "None": pa.chunked_array(
-                        [chunk.indices for chunk in data["None"].chunks],
+                [
+                    pa.chunked_array(
+                        [chunk.indices for chunk in data.column(0).chunks],
                         type=array.type.index_type,
                     )
-                }
+                ],
+                [None],
             )
             dictionaries_table = pa.table(
-                {
-                    "None": pa.chunked_array(
-                        [chunk.dictionary for chunk in data["None"].chunks],
+                [
+                    pa.chunked_array(
+                        [chunk.dictionary for chunk in data.column(0).chunks],
                         type=array.type.value_type,
                     )
-                }
+                ],
+                [None],
             )
-
-            codes = libcudf.interop.from_arrow(indices_table)[0]
-            categories = libcudf.interop.from_arrow(dictionaries_table)[0]
+            with acquire_spill_lock():
+                codes = cls.from_pylibcudf(
+                    plc.interop.from_arrow(indices_table).columns()[0]
+                )
+                categories = cls.from_pylibcudf(
+                    plc.interop.from_arrow(dictionaries_table).columns()[0]
+                )
             codes = cudf.core.column.categorical.as_unsigned_codes(
-                len(categories), codes
+                len(categories),
+                codes,  # type: ignore[arg-type]
             )
             return cudf.core.column.CategoricalColumn(
                 data=None,
@@ -368,15 +387,23 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
                 mask=codes.base_mask,
                 children=(codes,),
             )
+        else:
+            result = cls.from_pylibcudf(
+                plc.interop.from_arrow(data).columns()[0]
+            )
+            # TODO: cudf_dtype_from_pa_type may be less necessary for some types
+            return result._with_type_metadata(
+                cudf_dtype_from_pa_type(array.type)
+            )
 
-        result = libcudf.interop.from_arrow(data)[0]
-
-        return result._with_type_metadata(cudf_dtype_from_pa_type(array.type))
-
+    @acquire_spill_lock()
     def _get_mask_as_column(self) -> ColumnBase:
-        return libcudf.transform.mask_to_bools(
-            self.base_mask, self.offset, self.offset + len(self)
+        plc_column = plc.transform.mask_to_bools(
+            self.base_mask.get_ptr(mode="read"),  # type: ignore[union-attr]
+            self.offset,
+            self.offset + len(self),
         )
+        return type(self).from_pylibcudf(plc_column)
 
     @cached_property
     def memory_usage(self) -> int:
@@ -432,8 +459,16 @@ def _fill(
             )
         return self
 
-    def shift(self, offset: int, fill_value: ScalarLike) -> ColumnBase:
-        return libcudf.copying.shift(self, offset, fill_value)
+    @acquire_spill_lock()
+    def shift(self, offset: int, fill_value: ScalarLike) -> Self:
+        if not isinstance(fill_value, cudf.Scalar):
+            fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
+        plc_col = plc.copying.shift(
+            self.to_pylibcudf(mode="read"),
+            offset,
+            fill_value.device_value.c_value,
+        )
+        return type(self).from_pylibcudf(plc_col)  # type: ignore[return-value]
 
     @property
     def nullmask(self) -> Buffer:
@@ -461,8 +496,11 @@ def copy(self, deep: bool = True) -> Self:
             them.
         """
         if deep:
-            result = libcudf.copying.copy_column(self)
-            return result._with_type_metadata(self.dtype)
+            with acquire_spill_lock():
+                result = type(self).from_pylibcudf(
+                    self.to_pylibcudf(mode="read").copy()
+                )
+            return result._with_type_metadata(self.dtype)  # type: ignore[return-value]
         else:
             return cast(
                 Self,
@@ -543,7 +581,15 @@ def element_indexing(self, index: int):
             idx = len(self) + idx
         if idx > len(self) - 1 or idx < 0:
             raise IndexError("single positional indexer is out-of-bounds")
-        return libcudf.copying.get_element(self, idx).value
+        with acquire_spill_lock():
+            dscalar = libcudf.scalar.DeviceScalar.from_pylibcudf(
+                plc.copying.get_element(
+                    self.to_pylibcudf(mode="read"),
+                    idx,
+                ),
+                dtype=self.dtype,
+            )
+        return dscalar.value
 
     def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
         stride = 1 if stride is None else stride
@@ -552,12 +598,18 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
         if stop < 0 and not (stride < 0 and stop == -1):
             stop = stop + len(self)
         if (stride > 0 and start >= stop) or (stride < 0 and start <= stop):
-            return cast(Self, column_empty(0, self.dtype, masked=True))
+            return cast(Self, column_empty(0, self.dtype))
         # compute mask slice
         if stride == 1:
-            return libcudf.copying.column_slice(self, [start, stop])[
-                0
-            ]._with_type_metadata(self.dtype)
+            with acquire_spill_lock():
+                result = [
+                    type(self).from_pylibcudf(col)
+                    for col in plc.copying.slice(
+                        self.to_pylibcudf(mode="read"),
+                        [start, stop],
+                    )
+                ]
+            return result[0]._with_type_metadata(self.dtype)  # type: ignore[return-value]
         else:
             # Need to create a gather map for given slice with stride
             gather_map = as_column(
@@ -626,9 +678,16 @@ def _scatter_by_slice(
             if isinstance(value, cudf.core.scalar.Scalar):
                 return self._fill(value, start, stop, inplace=True)
             else:
-                return libcudf.copying.copy_range(
-                    value, self, 0, num_keys, start, stop, False
-                )
+                with acquire_spill_lock():
+                    return type(self).from_pylibcudf(  # type: ignore[return-value]
+                        plc.copying.copy_range(
+                            value.to_pylibcudf(mode="read"),
+                            self.to_pylibcudf(mode="read"),
+                            0,
+                            num_keys,
+                            start,
+                        )
+                    )
 
         # step != 1, create a scatter map with arange
         scatter_map = cast(
@@ -672,11 +731,21 @@ def _scatter_by_column(
         self._check_scatter_key_length(num_keys, value)
 
         if key.dtype.kind == "b":
-            return libcudf.copying.boolean_mask_scatter([value], [self], key)[
-                0
-            ]._with_type_metadata(self.dtype)
+            with acquire_spill_lock():
+                plc_table = plc.copying.boolean_mask_scatter(
+                    plc.Table([value.to_pylibcudf(mode="read")])
+                    if isinstance(value, Column)
+                    else [value.device_value.c_value],
+                    plc.Table([self.to_pylibcudf(mode="read")]),
+                    key.to_pylibcudf(mode="read"),
+                )
+                return (
+                    type(self)  # type: ignore[return-value]
+                    .from_pylibcudf(plc_table.columns()[0])
+                    ._with_type_metadata(self.dtype)
+                )
         else:
-            return libcudf.copying.scatter([value], key, [self])[
+            return copying.scatter([value], key, [self])[
                 0
             ]._with_type_metadata(self.dtype)
 
@@ -725,9 +794,7 @@ def fillna(
         if not self.has_nulls(include_nan=True):
             return self.copy()
         elif method is None:
-            if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar(
-                fill_value
-            ):
+            if is_scalar(fill_value) and _is_null_host_scalar(fill_value):
                 return self.copy()
             else:
                 fill_value = self._validate_fillna_value(fill_value)
@@ -806,7 +873,7 @@ def indices_of(
         else:
             value = as_column(value, dtype=self.dtype, length=1)
         mask = value.contains(self)
-        return apply_boolean_mask(
+        return apply_boolean_mask(  # type: ignore[return-value]
             [as_column(range(0, len(self)), dtype=size_type_dtype)], mask
         )[0]
 
@@ -888,14 +955,9 @@ def take(
         # be done by the caller. This check will be removed in future release.
         if indices.dtype.kind not in {"u", "i"}:
             indices = indices.astype(libcudf.types.size_type_dtype)
-        if not libcudf.copying._gather_map_is_valid(
-            indices, len(self), check_bounds, nullify
-        ):
-            raise IndexError("Gather map index is out of bounds.")
-
-        return libcudf.copying.gather([self], indices, nullify=nullify)[
-            0
-        ]._with_type_metadata(self.dtype)
+        GatherMap(indices, len(self), nullify=not check_bounds or nullify)
+        gathered = copying.gather([self], indices, nullify=nullify)  # type: ignore[arg-type]
+        return gathered[0]._with_type_metadata(self.dtype)  # type: ignore[return-value]
 
     def isin(self, values: Sequence) -> ColumnBase:
         """Check whether values are contained in the Column.
@@ -971,7 +1033,7 @@ def _obtain_isin_result(self, rhs: ColumnBase) -> ColumnBase:
             # https://github.com/rapidsai/cudf/issues/14515 by
             # providing a mode in which cudf::contains does not mask
             # the result.
-            result = result.fillna(cudf.Scalar(rhs.null_count > 0))
+            result = result.fillna(rhs.null_count > 0)
         return result
 
     def as_mask(self) -> Buffer:
@@ -981,11 +1043,14 @@ def as_mask(self) -> Buffer:
         -------
         Buffer
         """
-
         if self.has_nulls():
             raise ValueError("Column must have no nulls.")
 
-        return bools_to_mask(self)
+        with acquire_spill_lock():
+            mask, _ = plc.transform.bools_to_mask(
+                self.to_pylibcudf(mode="read")
+            )
+            return as_buffer(mask)
 
     @property
     def is_unique(self) -> bool:
@@ -994,13 +1059,13 @@ def is_unique(self) -> bool:
 
     @cached_property
     def is_monotonic_increasing(self) -> bool:
-        return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
+        return not self.has_nulls(include_nan=True) and sorting.is_sorted(
             [self], [True], None
         )
 
     @cached_property
     def is_monotonic_decreasing(self) -> bool:
-        return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
+        return not self.has_nulls(include_nan=True) and sorting.is_sorted(
             [self], [False], None
         )
 
@@ -1024,23 +1089,34 @@ def contains(self, other: ColumnBase) -> ColumnBase:
     def sort_values(
         self: Self,
         ascending: bool = True,
-        na_position: str = "last",
+        na_position: Literal["first", "last"] = "last",
     ) -> Self:
         if (not ascending and self.is_monotonic_decreasing) or (
             ascending and self.is_monotonic_increasing
         ):
             return self.copy()
-        return libcudf.sort.sort(
-            [self], column_order=[ascending], null_precedence=[na_position]
-        )[0]
+        order = sorting.ordering([ascending], [na_position])
+        with acquire_spill_lock():
+            plc_table = plc.sorting.sort(
+                plc.Table([self.to_pylibcudf(mode="read")]),
+                order[0],
+                order[1],
+            )
+            return type(self).from_pylibcudf(plc_table.columns()[0])  # type: ignore[return-value]
 
     def distinct_count(self, dropna: bool = True) -> int:
         try:
             return self._distinct_count[dropna]
         except KeyError:
-            self._distinct_count[dropna] = cpp_distinct_count(
-                self, ignore_nulls=dropna
-            )
+            with acquire_spill_lock():
+                result = plc.stream_compaction.distinct_count(
+                    self.to_pylibcudf(mode="read"),
+                    plc.types.NullPolicy.EXCLUDE
+                    if dropna
+                    else plc.types.NullPolicy.INCLUDE,
+                    plc.types.NanPolicy.NAN_IS_VALID,
+                )
+            self._distinct_count[dropna] = result
             return self._distinct_count[dropna]
 
     def can_cast_safely(self, to_dtype: Dtype) -> bool:
@@ -1052,7 +1128,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
             if self.dtype == dtype:
                 result = self
             else:
-                result = column_empty(0, dtype=dtype, masked=self.nullable)
+                result = column_empty(0, dtype=dtype)
         elif dtype == "category":
             # TODO: Figure out why `cudf.dtype("category")`
             # astype's different than just the string
@@ -1202,7 +1278,7 @@ def argsort(
                 as_column(range(len(self) - 1, -1, -1)),
             )
         else:
-            return libcudf.sort.order_by(
+            return sorting.order_by(
                 [self], [ascending], na_position, stable=True
             )
 
@@ -1269,7 +1345,7 @@ def unique(self) -> Self:
         if self.is_unique:
             return self.copy()
         else:
-            return drop_duplicates([self], keep="first")[
+            return drop_duplicates([self], keep="first")[  # type: ignore[return-value]
                 0
             ]._with_type_metadata(self.dtype)
 
@@ -1288,28 +1364,27 @@ def serialize(self) -> tuple[dict, list]:
 
         header: dict[Any, Any] = {}
         frames = []
-        header["type-serialized"] = pickle.dumps(type(self))
         try:
-            dtype, dtype_frames = self.dtype.serialize()
+            dtype, dtype_frames = self.dtype.device_serialize()
             header["dtype"] = dtype
             frames.extend(dtype_frames)
             header["dtype-is-cudf-serialized"] = True
         except AttributeError:
-            header["dtype"] = pickle.dumps(self.dtype)
+            header["dtype"] = self.dtype.str
             header["dtype-is-cudf-serialized"] = False
 
         if self.data is not None:
-            data_header, data_frames = self.data.serialize()
+            data_header, data_frames = self.data.device_serialize()
             header["data"] = data_header
             frames.extend(data_frames)
 
         if self.mask is not None:
-            mask_header, mask_frames = self.mask.serialize()
+            mask_header, mask_frames = self.mask.device_serialize()
             header["mask"] = mask_header
             frames.extend(mask_frames)
         if self.children:
             child_headers, child_frames = zip(
-                *(c.serialize() for c in self.children)
+                *(c.device_serialize() for c in self.children)
             )
             header["subheaders"] = list(child_headers)
             frames.extend(chain(*child_frames))
@@ -1321,8 +1396,7 @@ def serialize(self) -> tuple[dict, list]:
     def deserialize(cls, header: dict, frames: list) -> ColumnBase:
         def unpack(header, frames) -> tuple[Any, list]:
             count = header["frame_count"]
-            klass = pickle.loads(header["type-serialized"])
-            obj = klass.deserialize(header, frames[:count])
+            obj = cls.device_deserialize(header, frames[:count])
             return obj, frames[count:]
 
         assert header["frame_count"] == len(frames), (
@@ -1332,7 +1406,7 @@ def unpack(header, frames) -> tuple[Any, list]:
         if header["dtype-is-cudf-serialized"]:
             dtype, frames = unpack(header["dtype"], frames)
         else:
-            dtype = pickle.loads(header["dtype"])
+            dtype = np.dtype(header["dtype"])
         if "data" in header:
             data, frames = unpack(header["data"], frames)
         else:
@@ -1390,33 +1464,35 @@ def _reduce(
         )
         if isinstance(preprocessed, ColumnBase):
             dtype = kwargs.pop("dtype", None)
-            return libcudf.reduce.reduce(
-                op, preprocessed, dtype=dtype, **kwargs
-            )
+            return preprocessed.reduce(op, dtype, **kwargs)
         return preprocessed
 
+    def _can_return_nan(self, skipna: bool | None = None) -> bool:
+        return not skipna and self.has_nulls(include_nan=False)
+
     def _process_for_reduction(
         self, skipna: bool | None = None, min_count: int = 0
     ) -> ColumnBase | ScalarLike:
-        if skipna is None:
-            skipna = True
+        skipna = True if skipna is None else skipna
 
-        if self.has_nulls():
+        if self._can_return_nan(skipna=skipna):
+            return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
+
+        col = self.nans_to_nulls() if skipna else self
+        if col.has_nulls():
             if skipna:
-                result_col = self.dropna()
+                col = col.dropna()
             else:
                 return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
 
-        result_col = self
-
         # TODO: If and when pandas decides to validate that `min_count` >= 0 we
         # should insert comparable behavior.
         # https://github.com/pandas-dev/pandas/issues/50022
         if min_count > 0:
-            valid_count = len(result_col) - result_col.null_count
+            valid_count = len(col) - col.null_count
             if valid_count < min_count:
                 return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
-        return result_col
+        return col
 
     def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
         """
@@ -1500,20 +1576,130 @@ def _return_sentinel_column():
         left_gather_map = type(self).from_pylibcudf(left_rows)
         right_gather_map = type(self).from_pylibcudf(right_rows)
 
-        codes = libcudf.copying.gather(
-            [as_column(range(len(cats)), dtype=dtype)],
-            right_gather_map,
-            nullify=True,
+        codes = as_column(range(len(cats)), dtype=dtype).take(
+            right_gather_map, nullify=True
         )
         del right_gather_map
         del right_rows
         # reorder `codes` so that its values correspond to the
         # values of `self`:
-        (codes,) = libcudf.sort.sort_by_key(
-            codes, [left_gather_map], [True], ["last"], stable=True
+        (codes,) = sorting.sort_by_key(
+            [codes], [left_gather_map], [True], ["last"], stable=True
         )
         return codes.fillna(na_sentinel.value)
 
+    @acquire_spill_lock()
+    def copy_if_else(
+        self, other: Self | cudf.Scalar, boolean_mask: NumericalColumn
+    ) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.copying.copy_if_else(
+                self.to_pylibcudf(mode="read"),
+                other.device_value.c_value
+                if isinstance(other, cudf.Scalar)
+                else other.to_pylibcudf(mode="read"),
+                boolean_mask.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def one_hot_encode(
+        self, categories: ColumnBase
+    ) -> abc.Generator[ColumnBase]:
+        plc_table = plc.transform.one_hot_encode(
+            self.to_pylibcudf(mode="read"),
+            categories.to_pylibcudf(mode="read"),
+        )
+        return (
+            type(self).from_pylibcudf(col, data_ptr_exposed=True)
+            for col in plc_table.columns()
+        )
+
+    @acquire_spill_lock()
+    def scan(self, scan_op: str, inclusive: bool, **kwargs) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.reduce.scan(
+                self.to_pylibcudf(mode="read"),
+                aggregation.make_aggregation(scan_op, kwargs).plc_obj,
+                plc.reduce.ScanType.INCLUSIVE
+                if inclusive
+                else plc.reduce.ScanType.EXCLUSIVE,
+            )
+        )
+
+    def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike:
+        if dtype is not None:
+            warnings.warn(
+                "dtype is deprecated and will be removed in a future release. "
+                "Cast the result (e.g. .astype) after the operation instead.",
+                FutureWarning,
+            )
+            col_dtype = dtype
+        else:
+            col_dtype = self._reduction_result_dtype(reduction_op)
+
+        # check empty case
+        if len(self) <= self.null_count:
+            if reduction_op == "sum" or reduction_op == "sum_of_squares":
+                return self.dtype.type(0)
+            if reduction_op == "product":
+                return self.dtype.type(1)
+            if reduction_op == "any":
+                return False
+
+            return cudf.utils.dtypes._get_nan_for_dtype(col_dtype)
+
+        with acquire_spill_lock():
+            plc_scalar = plc.reduce.reduce(
+                self.to_pylibcudf(mode="read"),
+                aggregation.make_aggregation(reduction_op, kwargs).plc_obj,
+                dtype_to_pylibcudf_type(col_dtype),
+            )
+            result_col = type(self).from_pylibcudf(
+                plc.Column.from_scalar(plc_scalar, 1)
+            )
+            if plc_scalar.type().id() in {
+                plc.TypeId.DECIMAL128,
+                plc.TypeId.DECIMAL64,
+                plc.TypeId.DECIMAL32,
+            }:
+                scale = -plc_scalar.type().scale()
+                # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql
+                p = col_dtype.precision
+                nrows = len(self)
+                if reduction_op in {"min", "max"}:
+                    new_p = p
+                elif reduction_op == "sum":
+                    new_p = p + nrows - 1
+                elif reduction_op == "product":
+                    new_p = p * nrows + nrows - 1
+                elif reduction_op == "sum_of_squares":
+                    new_p = 2 * p + nrows
+                else:
+                    raise NotImplementedError(
+                        f"{reduction_op} not implemented for decimal types."
+                    )
+                precision = max(min(new_p, col_dtype.MAX_PRECISION), 0)
+                new_dtype = type(col_dtype)(precision, scale)
+                result_col = result_col.astype(new_dtype)
+            elif isinstance(col_dtype, cudf.IntervalDtype):
+                result_col = type(self).from_struct_column(  # type: ignore[attr-defined]
+                    result_col, closed=col_dtype.closed
+                )
+        return result_col.element_indexing(0)
+
+    @acquire_spill_lock()
+    def minmax(self) -> tuple[ScalarLike, ScalarLike]:
+        min_val, max_val = plc.reduce.minmax(self.to_pylibcudf(mode="read"))
+        return (
+            type(self)
+            .from_pylibcudf(plc.Column.from_scalar(min_val, 1))
+            .element_indexing(0),
+            type(self)
+            .from_pylibcudf(plc.Column.from_scalar(max_val, 1))
+            .element_indexing(0),
+        )
+
 
 def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool:
     """Check if an object dtype Series or array contains NaN."""
@@ -1526,7 +1712,6 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool:
 def column_empty(
     row_count: int,
     dtype: Dtype = "object",
-    masked: bool = False,
     for_numba: bool = False,
 ) -> ColumnBase:
     """
@@ -1543,9 +1728,6 @@ def column_empty(
     dtype : Dtype
         Type of the column.
 
-    masked : bool
-        Unused.
-
     for_numba : bool, default False
         If True, don't allocate a mask as it's not supported by numba.
     """
@@ -1817,12 +1999,12 @@ def as_column(
             column = Column.from_pylibcudf(
                 plc.filling.sequence(
                     len(arbitrary),
-                    as_device_scalar(
-                        arbitrary.start, dtype=np.dtype(np.int64)
-                    ).c_value,
-                    as_device_scalar(
-                        arbitrary.step, dtype=np.dtype(np.int64)
-                    ).c_value,
+                    plc.interop.from_arrow(
+                        pa.scalar(arbitrary.start, type=pa.int64())
+                    ),
+                    plc.interop.from_arrow(
+                        pa.scalar(arbitrary.step, type=pa.int64())
+                    ),
                 )
             )
         if cudf.get_option("default_integer_bitwidth") and dtype is None:
@@ -1924,18 +2106,26 @@ def as_column(
             if isinstance(arbitrary.dtype, pd.DatetimeTZDtype):
                 new_tz = get_compatible_timezone(arbitrary.dtype)
                 arbitrary = arbitrary.astype(new_tz)
-            if isinstance(arbitrary.dtype, pd.CategoricalDtype) and isinstance(
-                arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
-            ):
-                new_tz = get_compatible_timezone(
-                    arbitrary.dtype.categories.dtype
-                )
-                new_cats = arbitrary.dtype.categories.astype(new_tz)
-                new_dtype = pd.CategoricalDtype(
-                    categories=new_cats, ordered=arbitrary.dtype.ordered
-                )
-                arbitrary = arbitrary.astype(new_dtype)
-
+            if isinstance(arbitrary.dtype, pd.CategoricalDtype):
+                if isinstance(
+                    arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
+                ):
+                    new_tz = get_compatible_timezone(
+                        arbitrary.dtype.categories.dtype
+                    )
+                    new_cats = arbitrary.dtype.categories.astype(new_tz)
+                    new_dtype = pd.CategoricalDtype(
+                        categories=new_cats, ordered=arbitrary.dtype.ordered
+                    )
+                    arbitrary = arbitrary.astype(new_dtype)
+                elif (
+                    isinstance(
+                        arbitrary.dtype.categories.dtype, pd.IntervalDtype
+                    )
+                    and dtype is None
+                ):
+                    # Conversion to arrow converts IntervalDtype to StructDtype
+                    dtype = cudf.CategoricalDtype.from_pandas(arbitrary.dtype)
             return as_column(
                 pa.array(arbitrary, from_pandas=True),
                 nan_as_null=nan_as_null,
@@ -2093,8 +2283,7 @@ def as_column(
                     )
                 # Consider NaT as NA in the mask
                 # but maintain NaT as a value
-                bool_mask = as_column(~is_nat)
-                mask = as_buffer(bools_to_mask(bool_mask))
+                mask = as_column(~is_nat).as_mask()
             buffer = as_buffer(arbitrary.view("|u1"))
             col = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
             if dtype:
@@ -2264,8 +2453,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer:
         )
         return as_buffer(data=desc["data"][0], size=mask_size, owner=obj)
     elif typecode == "b":
-        col = as_column(cai_mask)
-        return bools_to_mask(col)
+        return as_column(cai_mask).as_mask()
     else:
         raise NotImplementedError(f"Cannot infer mask from typestr {typestr}")
 
@@ -2291,7 +2479,9 @@ def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]:
     frames = []
 
     if len(columns) > 0:
-        header_columns = [c.serialize() for c in columns]
+        header_columns: list[tuple[dict, list]] = [
+            c.device_serialize() for c in columns
+        ]
         headers, column_frames = zip(*header_columns)
         for f in column_frames:
             frames.extend(f)
@@ -2308,7 +2498,7 @@ def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]:
 
     for meta in headers:
         col_frame_count = meta["frame_count"]
-        col_typ = pickle.loads(meta["type-serialized"])
+        col_typ = Serializable._name_type_map[meta["type-serialized-name"]]
         colobj = col_typ.deserialize(meta, frames[:col_frame_count])
         columns.append(colobj)
         # Advance frames
@@ -2321,7 +2511,7 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
     """Concatenate a sequence of columns."""
     if len(objs) == 0:
         dtype = cudf.dtype(None)
-        return column_empty(0, dtype=dtype, masked=True)
+        return column_empty(0, dtype=dtype)
 
     # If all columns are `NumericalColumn` with different dtypes,
     # we cast them to a common dtype.
@@ -2362,13 +2552,13 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
         )
 
     newsize = sum(map(len, objs))
-    if newsize > libcudf.MAX_COLUMN_SIZE:
+    if newsize > np.iinfo(libcudf.types.size_type_dtype).max:
         raise MemoryError(
             f"Result of concat cannot have "
-            f"size > {libcudf.MAX_COLUMN_SIZE_STR}"
+            f"size > {libcudf.types.size_type_dtype}_MAX"
         )
     elif newsize == 0:
-        return column_empty(0, head.dtype, masked=True)
+        return column_empty(0, head.dtype)
 
     # Filter out inputs that have 0 length, then concatenate.
     objs_with_len = [o for o in objs if len(o)]
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index b526a6efa51..1bde7d27700 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -19,7 +19,6 @@
 
 import cudf
 import cudf.core.column.column as column
-import cudf.core.column.string as string
 from cudf import _lib as libcudf
 from cudf.core._compat import PANDAS_GE_220
 from cudf.core._internals import binaryop, unary
@@ -213,6 +212,8 @@ class DatetimeColumn(column.ColumnBase):
         "__rsub__",
     }
 
+    _PANDAS_NA_REPR = str(pd.NaT)
+
     def __init__(
         self,
         data: Buffer,
@@ -352,11 +353,9 @@ def is_year_end(self) -> ColumnBase:
         day_of_year = self.day_of_year
         leap_dates = self.is_leap_year
 
-        leap = day_of_year == cudf.Scalar(366)
-        non_leap = day_of_year == cudf.Scalar(365)
-        return libcudf.copying.copy_if_else(leap, non_leap, leap_dates).fillna(
-            False
-        )
+        leap = day_of_year == 366
+        non_leap = day_of_year == 365
+        return leap.copy_if_else(non_leap, leap_dates).fillna(False)
 
     @property
     def is_leap_year(self) -> ColumnBase:
@@ -598,17 +597,20 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn:
         if len(self) == 0:
             return cast(
                 cudf.core.column.StringColumn,
-                column.column_empty(0, dtype="object", masked=False),
+                column.column_empty(0, dtype="object"),
             )
         if format in _DATETIME_SPECIAL_FORMATS:
             names = as_column(_DATETIME_NAMES)
         else:
-            names = cudf.core.column.column_empty(
-                0, dtype="object", masked=False
+            names = column.column_empty(0, dtype="object")
+        with acquire_spill_lock():
+            return type(self).from_pylibcudf(  # type: ignore[return-value]
+                plc.strings.convert.convert_datetime.from_timestamps(
+                    self.to_pylibcudf(mode="read"),
+                    format,
+                    names.to_pylibcudf(mode="read"),
+                )
             )
-        return string._datetime_to_str_typecast_functions[self.dtype](
-            self, format, names
-        )
 
     def as_string_column(self) -> cudf.core.column.StringColumn:
         format = _dtype_to_format_conversion.get(
@@ -1016,7 +1018,7 @@ def to_pandas(
                 self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
             )
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         return pa.compute.assume_timezone(
             self._local_time.to_arrow(), str(self.dtype.tz)
         )
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 2c22724d3d7..09941665ba2 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -10,13 +10,12 @@
 import numpy as np
 import pyarrow as pa
 
+import pylibcudf as plc
+
 import cudf
-from cudf._lib.strings.convert.convert_fixed_point import (
-    from_decimal as cpp_from_decimal,
-)
 from cudf.api.types import is_scalar
 from cudf.core._internals import binaryop, unary
-from cudf.core.buffer import as_buffer
+from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.core.column.column import ColumnBase
 from cudf.core.column.numerical_base import NumericalBaseColumn
 from cudf.core.dtypes import (
@@ -89,7 +88,13 @@ def as_decimal_column(
 
     def as_string_column(self) -> cudf.core.column.StringColumn:
         if len(self) > 0:
-            return cpp_from_decimal(self)
+            with acquire_spill_lock():
+                plc_column = (
+                    plc.strings.convert.convert_fixed_point.from_fixed_point(
+                        self.to_pylibcudf(mode="read"),
+                    )
+                )
+                return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
         else:
             return cast(
                 cudf.core.column.StringColumn,
@@ -264,8 +269,8 @@ def from_arrow(cls, data: pa.Array):
             mask=mask,
         )
 
-    def to_arrow(self):
-        data_buf_32 = np.array(self.base_data.memoryview()).view("int32")
+    def to_arrow(self) -> pa.Array:
+        data_buf_32 = np.array(self.base_data.memoryview()).view("int32")  # type: ignore[union-attr]
         data_buf_128 = np.empty(len(data_buf_32) * 4, dtype="int32")
 
         # use striding to set the first 32 bits of each 128-bit chunk:
@@ -332,7 +337,7 @@ def from_arrow(cls, data: pa.Array):
         result.dtype.precision = data.type.precision
         return result
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         return super().to_arrow().cast(self.dtype.to_arrow())
 
     def _with_type_metadata(
@@ -391,8 +396,8 @@ def from_arrow(cls, data: pa.Array):
             mask=mask,
         )
 
-    def to_arrow(self):
-        data_buf_64 = np.array(self.base_data.memoryview()).view("int64")
+    def to_arrow(self) -> pa.Array:
+        data_buf_64 = np.array(self.base_data.memoryview()).view("int64")  # type: ignore[union-attr]
         data_buf_128 = np.empty(len(data_buf_64) * 2, dtype="int64")
 
         # use striding to set the first 64 bits of each 128-bit chunk:
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index 34975fc94f4..dd8f58a118e 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -14,7 +14,6 @@
 if TYPE_CHECKING:
     from typing_extensions import Self
 
-    from cudf._typing import ScalarLike
     from cudf.core.buffer import Buffer
     from cudf.core.column import ColumnBase
 
@@ -211,16 +210,3 @@ def element_indexing(self, index: int):
         if cudf.get_option("mode.pandas_compatible"):
             return pd.Interval(**result, closed=self.dtype.closed)
         return result
-
-    def _reduce(
-        self,
-        op: str,
-        skipna: bool | None = None,
-        min_count: int = 0,
-        *args,
-        **kwargs,
-    ) -> ScalarLike:
-        result = super()._reduce(op, skipna, min_count, *args, **kwargs)
-        if cudf.get_option("mode.pandas_compatible"):
-            return pd.Interval(**result, closed=self.dtype.closed)
-        return result
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index ea384888388..6fc2b5d4ca2 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -14,7 +14,6 @@
 
 import cudf
 import cudf.core.column.column as column
-from cudf._lib.strings.convert.convert_lists import format_list_column
 from cudf._lib.types import size_type_dtype
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
 from cudf.core.buffer import acquire_spill_lock
@@ -29,6 +28,7 @@
 
     from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
     from cudf.core.buffer import Buffer
+    from cudf.core.column.string import StringColumn
 
 
 class ListColumn(ColumnBase):
@@ -68,6 +68,16 @@ def __init__(
             children=children,
         )
 
+    def _prep_pandas_compat_repr(self) -> StringColumn | Self:
+        """
+        Preprocess Column to be compatible with pandas repr, namely handling nulls.
+
+        * null (datetime/timedelta) = str(pd.NaT)
+        * null (other types) = str(pd.NA)
+        """
+        # TODO: handle if self.has_nulls(): case
+        return self
+
     @cached_property
     def memory_usage(self):
         n = super().memory_usage
@@ -151,7 +161,7 @@ def offsets(self) -> NumericalColumn:
         """
         return cast(NumericalColumn, self.children[0])
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         offsets = self.offsets.to_arrow()
         elements = (
             pa.nulls(len(self.elements))
@@ -161,7 +171,7 @@ def to_arrow(self):
         pa_type = pa.list_(elements.type)
 
         if self.nullable:
-            nbuf = pa.py_buffer(self.mask.memoryview())
+            nbuf = pa.py_buffer(self.mask.memoryview())  # type: ignore[union-attr]
             buffers = (nbuf, offsets.buffers()[1])
         else:
             buffers = offsets.buffers()
@@ -237,7 +247,7 @@ def from_sequences(
 
         # Build Data, Mask & Offsets
         for data in arbitrary:
-            if cudf._lib.scalar._is_null_host_scalar(data):
+            if cudf.utils.utils._is_null_host_scalar(data):
                 mask_col.append(False)
                 offset_vals.append(offset)
             else:
@@ -256,7 +266,7 @@ def from_sequences(
             data=None,
             size=len(arbitrary),
             dtype=cudf.ListDtype(data_col.dtype),
-            mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)),
+            mask=as_column(mask_col).as_mask(),
             offset=0,
             null_count=0,
             children=(offset_col, data_col),
@@ -272,8 +282,13 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
         # Separator strings to match the Python format
         separators = as_column([", ", "[", "]"])
 
-        # Call libcudf to format the list column
-        return format_list_column(lc, separators)
+        with acquire_spill_lock():
+            plc_column = plc.strings.convert.convert_lists.format_list_column(
+                lc.to_pylibcudf(mode="read"),
+                plc.interop.from_arrow(pa.scalar("None")),
+                separators.to_pylibcudf(mode="read"),
+            )
+            return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
 
     def _transform_leaves(self, func, *args, **kwargs) -> Self:
         # return a new list column with the same nested structure
@@ -376,20 +391,20 @@ def extract_element_column(self, index: ColumnBase) -> ColumnBase:
         )
 
     @acquire_spill_lock()
-    def contains_scalar(self, search_key: cudf.Scalar) -> ColumnBase:
+    def contains_scalar(self, search_key: pa.Scalar) -> ColumnBase:
         return type(self).from_pylibcudf(
             plc.lists.contains(
                 self.to_pylibcudf(mode="read"),
-                search_key.device_value.c_value,
+                plc.interop.from_arrow(search_key),
             )
         )
 
     @acquire_spill_lock()
-    def index_of_scalar(self, search_key: cudf.Scalar) -> ColumnBase:
+    def index_of_scalar(self, search_key: pa.Scalar) -> ColumnBase:
         return type(self).from_pylibcudf(
             plc.lists.index_of(
                 self.to_pylibcudf(mode="read"),
-                search_key.device_value.c_value,
+                plc.interop.from_arrow(search_key),
                 plc.lists.DuplicateFindOption.FIND_FIRST,
             )
         )
@@ -554,7 +569,7 @@ def contains(self, search_key: ScalarLike) -> ParentType:
         dtype: bool
         """
         return self._return_or_inplace(
-            self._column.contains_scalar(cudf.Scalar(search_key))
+            self._column.contains_scalar(pa.scalar(search_key))
         )
 
     def index(self, search_key: ScalarLike | ColumnLike) -> ParentType:
@@ -603,7 +618,7 @@ def index(self, search_key: ScalarLike | ColumnLike) -> ParentType:
         """
 
         if is_scalar(search_key):
-            result = self._column.index_of_scalar(cudf.Scalar(search_key))
+            result = self._column.index_of_scalar(pa.scalar(search_key))
         else:
             result = self._column.index_of_column(as_column(search_key))
         return self._return_or_inplace(result)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 9514aaeab50..70103745926 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -7,21 +7,23 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
+from numba.np import numpy_support
 from typing_extensions import Self
 
-import pylibcudf
+import pylibcudf as plc
 
 import cudf
 import cudf.core.column.column as column
-import cudf.core.column.string as string
-from cudf import _lib as libcudf
 from cudf.api.types import is_integer, is_scalar
 from cudf.core._internals import binaryop, unary
+from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.core.column.column import ColumnBase, as_column
 from cudf.core.column.numerical_base import NumericalBaseColumn
 from cudf.core.dtypes import CategoricalDtype
 from cudf.core.mixins import BinaryOperand
 from cudf.errors import MixedTypeError
+from cudf.utils import cudautils
 from cudf.utils.dtypes import (
     find_common_type,
     min_column_type,
@@ -150,7 +152,7 @@ def __setitem__(self, key: Any, value: Any):
             cudf.Scalar(
                 value,
                 dtype=self.dtype
-                if cudf._lib.scalar._is_null_host_scalar(value)
+                if cudf.utils.utils._is_null_host_scalar(value)
                 else None,
             )
             if is_scalar(value)
@@ -179,13 +181,27 @@ def __setitem__(self, key: Any, value: Any):
         if out:
             self._mimic_inplace(out, inplace=True)
 
+    @acquire_spill_lock()
+    def transform(self, compiled_op, np_dtype: np.dtype) -> ColumnBase:
+        plc_column = plc.transform.transform(
+            self.to_pylibcudf(mode="read"),
+            compiled_op[0],
+            plc.column._datatype_from_dtype_desc(np_dtype.str[1:]),
+            True,
+        )
+        return type(self).from_pylibcudf(plc_column)
+
     def unary_operator(self, unaryop: str | Callable) -> ColumnBase:
         if callable(unaryop):
-            return libcudf.transform.transform(self, unaryop)
+            nb_type = numpy_support.from_dtype(self.dtype)
+            nb_signature = (nb_type,)
+            compiled_op = cudautils.compile_udf(unaryop, nb_signature)
+            np_dtype = np.dtype(compiled_op[1])
+            return self.transform(compiled_op, np_dtype)
 
         unaryop = unaryop.upper()
         unaryop = _unaryop_map.get(unaryop, unaryop)
-        unaryop = pylibcudf.unary.UnaryOperator[unaryop]
+        unaryop = plc.unary.UnaryOperator[unaryop]
         return unary.unary_operation(self, unaryop)
 
     def __invert__(self):
@@ -298,8 +314,11 @@ def nans_to_nulls(self: Self) -> Self:
         # Only floats can contain nan.
         if self.dtype.kind != "f" or self.nan_count == 0:
             return self
-        newmask = libcudf.transform.nans_to_nulls(self)
-        return self.set_mask(newmask)
+        with acquire_spill_lock():
+            mask, _ = plc.transform.nans_to_nulls(
+                self.to_pylibcudf(mode="read")
+            )
+            return self.set_mask(as_buffer(mask))
 
     def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar:
         if isinstance(other, ColumnBase):
@@ -346,22 +365,38 @@ def normalize_binop_value(self, other: ScalarLike) -> Self | cudf.Scalar:
         else:
             return NotImplemented
 
-    def int2ip(self) -> "cudf.core.column.StringColumn":
-        if self.dtype != cudf.dtype("uint32"):
+    @acquire_spill_lock()
+    def int2ip(self) -> cudf.core.column.StringColumn:
+        if self.dtype != np.dtype(np.uint32):
             raise TypeError("Only uint32 type can be converted to ip")
-
-        return libcudf.string_casting.int2ip(self)
+        plc_column = plc.strings.convert.convert_ipv4.integers_to_ipv4(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
 
     def as_string_column(self) -> cudf.core.column.StringColumn:
-        if len(self) > 0:
-            return string._numeric_to_str_typecast_functions[
-                cudf.dtype(self.dtype)
-            ](self)
-        else:
+        if len(self) == 0:
             return cast(
                 cudf.core.column.StringColumn,
                 column.column_empty(0, dtype="object"),
             )
+        elif self.dtype.kind == "b":
+            conv_func = functools.partial(
+                plc.strings.convert.convert_booleans.from_booleans,
+                true_string=plc.interop.from_arrow(pa.scalar("True")),
+                false_string=plc.interop.from_arrow(pa.scalar("False")),
+            )
+        elif self.dtype.kind in {"i", "u"}:
+            conv_func = plc.strings.convert.convert_integers.from_integers
+        elif self.dtype.kind == "f":
+            conv_func = plc.strings.convert.convert_floats.from_floats
+        else:
+            raise ValueError(f"No string conversion from type {self.dtype}")
+
+        with acquire_spill_lock():
+            return type(self).from_pylibcudf(  # type: ignore[return-value]
+                conv_func(self.to_pylibcudf(mode="read"))
+            )
 
     def as_datetime_column(
         self, dtype: Dtype
@@ -400,22 +435,12 @@ def all(self, skipna: bool = True) -> bool:
         # If all entries are null the result is True, including when the column
         # is empty.
         result_col = self.nans_to_nulls() if skipna else self
-
-        if result_col.null_count == result_col.size:
-            return True
-
-        return libcudf.reduce.reduce("all", result_col)
+        return super(NumericalColumn, result_col).all(skipna=skipna)
 
     def any(self, skipna: bool = True) -> bool:
         # Early exit for fast cases.
         result_col = self.nans_to_nulls() if skipna else self
-
-        if not skipna and result_col.has_nulls():
-            return True
-        elif skipna and result_col.null_count == result_col.size:
-            return False
-
-        return libcudf.reduce.reduce("any", result_col)
+        return super(NumericalColumn, result_col).any(skipna=skipna)
 
     @functools.cached_property
     def nan_count(self) -> int:
@@ -463,19 +488,6 @@ def _process_values_for_isin(
     def _can_return_nan(self, skipna: bool | None = None) -> bool:
         return not skipna and self.has_nulls(include_nan=True)
 
-    def _process_for_reduction(
-        self, skipna: bool | None = None, min_count: int = 0
-    ) -> NumericalColumn | ScalarLike:
-        skipna = True if skipna is None else skipna
-
-        if self._can_return_nan(skipna=skipna):
-            return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
-
-        col = self.nans_to_nulls() if skipna else self
-        return super(NumericalColumn, col)._process_for_reduction(
-            skipna=skipna, min_count=min_count
-        )
-
     def find_and_replace(
         self,
         to_replace: ColumnLike,
@@ -721,6 +733,40 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
 
         return super()._reduction_result_dtype(reduction_op)
 
+    @acquire_spill_lock()
+    def digitize(self, bins: np.ndarray, right: bool = False) -> Self:
+        """Return the indices of the bins to which each value in column belongs.
+
+        Parameters
+        ----------
+        bins : np.ndarray
+            1-D array of bins with the same dtype as this column; should be
+            monotonically increasing and must not contain nulls.
+        right : bool
+            Indicates whether interval contains the right or left bin edge.
+
+        Returns
+        -------
+        A column containing the indices
+        """
+        if self.dtype != bins.dtype:
+            raise ValueError(
+                "digitize() expects bins and input column have the same dtype."
+            )
+
+        bin_col = as_column(bins, dtype=bins.dtype)
+        if bin_col.nullable:
+            raise ValueError("`bins` cannot contain null entries.")
+
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            getattr(plc.search, "lower_bound" if right else "upper_bound")(
+                plc.Table([bin_col.to_pylibcudf(mode="read")]),
+                plc.Table([self.to_pylibcudf(mode="read")]),
+                [plc.types.Order.ASCENDING],
+                [plc.types.NullOrder.BEFORE],
+            )
+        )
+
 
 def _normalize_find_and_replace_input(
     input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list
@@ -740,7 +786,7 @@ def _normalize_find_and_replace_input(
         )
         # Scalar case
         if len(col_to_normalize) == 1:
-            if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]):
+            if cudf.utils.utils._is_null_host_scalar(col_to_normalize[0]):
                 return normalized_column.astype(input_column_dtype)
             if np.isinf(col_to_normalize[0]):
                 return normalized_column
@@ -775,34 +821,3 @@ def _normalize_find_and_replace_input(
     if not normalized_column.can_cast_safely(input_column_dtype):
         return normalized_column
     return normalized_column.astype(input_column_dtype)
-
-
-def digitize(
-    column: ColumnBase, bins: np.ndarray, right: bool = False
-) -> ColumnBase:
-    """Return the indices of the bins to which each value in column belongs.
-
-    Parameters
-    ----------
-    column : Column
-        Input column.
-    bins : Column-like
-        1-D column-like object of bins with same type as `column`, should be
-        monotonically increasing.
-    right : bool
-        Indicates whether interval contains the right or left bin edge.
-
-    Returns
-    -------
-    A column containing the indices
-    """
-    if not column.dtype == bins.dtype:
-        raise ValueError(
-            "Digitize() expects bins and input column have the same dtype."
-        )
-
-    bin_col = as_column(bins, dtype=bins.dtype)
-    if bin_col.nullable:
-        raise ValueError("`bins` cannot contain null entries.")
-
-    return as_column(libcudf.sort.digitize([column], [bin_col], right))
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index ea242e34edb..689d5132d45 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -3,14 +3,14 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Literal, cast
 
 import numpy as np
 
 import pylibcudf as plc
 
 import cudf
-from cudf import _lib as libcudf
+from cudf.core._internals import sorting
 from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column.column import ColumnBase
 from cudf.core.missing import NA
@@ -139,17 +139,18 @@ def quantile(
             result = cast(
                 NumericalBaseColumn,
                 cudf.core.column.column_empty(
-                    row_count=len(q), dtype=self.dtype, masked=True
+                    row_count=len(q), dtype=self.dtype
                 ),
             )
         else:
+            no_nans = self.nans_to_nulls()
             # get sorted indices and exclude nulls
-            indices = libcudf.sort.order_by(
-                [self], [True], "first", stable=True
-            ).slice(self.null_count, len(self))
+            indices = sorting.order_by(
+                [no_nans], [True], "first", stable=True
+            ).slice(no_nans.null_count, len(no_nans))
             with acquire_spill_lock():
                 plc_column = plc.quantiles.quantile(
-                    self.to_pylibcudf(mode="read"),
+                    no_nans.to_pylibcudf(mode="read"),
                     q,
                     plc.types.Interpolation[interpolation.upper()],
                     indices.to_pylibcudf(mode="read"),
@@ -246,14 +247,23 @@ def corr(self, other: NumericalBaseColumn) -> float:
         return cov / lhs_std / rhs_std
 
     def round(
-        self, decimals: int = 0, how: str = "half_even"
+        self,
+        decimals: int = 0,
+        how: Literal["half_even", "half_up"] = "half_even",
     ) -> NumericalBaseColumn:
         if not cudf.api.types.is_integer(decimals):
-            raise TypeError("Values in decimals must be integers")
-        """Round the values in the Column to the given number of decimals."""
-        return libcudf.round.round(self, decimal_places=decimals, how=how)
+            raise TypeError("Argument 'decimals' must be an integer")
+        if how not in {"half_even", "half_up"}:
+            raise ValueError(f"{how=} must be either 'half_even' or 'half_up'")
+        plc_how = plc.round.RoundingMethod[how.upper()]
+        with acquire_spill_lock():
+            return type(self).from_pylibcudf(  # type: ignore[return-value]
+                plc.round.round(
+                    self.to_pylibcudf(mode="read"), decimals, plc_how
+                )
+            )
 
     def _scan(self, op: str) -> ColumnBase:
-        return libcudf.reduce.scan(
-            op.replace("cum", ""), self, True
-        )._with_type_metadata(self.dtype)
+        return self.scan(op.replace("cum", ""), True)._with_type_metadata(
+            self.dtype
+        )
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 6b45828568c..20eded9a27f 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -19,9 +19,9 @@
 import cudf.api.types
 import cudf.core.column.column as column
 import cudf.core.column.datetime as datetime
-from cudf._lib import string_casting as str_cast, strings as libstrings
+from cudf import _lib as libcudf
 from cudf._lib.column import Column
-from cudf._lib.types import size_type_dtype
+from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype
 from cudf.api.types import is_integer, is_scalar, is_string_dtype
 from cudf.core._internals import binaryop
 from cudf.core.buffer import acquire_spill_lock
@@ -44,64 +44,11 @@
         SeriesOrIndex,
     )
     from cudf.core.buffer import Buffer
+    from cudf.core.column.lists import ListColumn
+    from cudf.core.column.numerical import NumericalColumn
 
 
-def str_to_boolean(column: StringColumn):
-    """Takes in string column and returns boolean column"""
-    with acquire_spill_lock():
-        plc_column = plc.strings.attributes.count_characters(
-            column.to_pylibcudf(mode="read")
-        )
-        result = Column.from_pylibcudf(plc_column)
-    return (result > cudf.Scalar(0, dtype="int8")).fillna(False)
-
-
-_str_to_numeric_typecast_functions = {
-    cudf.api.types.dtype("int8"): str_cast.stoi8,
-    cudf.api.types.dtype("int16"): str_cast.stoi16,
-    cudf.api.types.dtype("int32"): str_cast.stoi,
-    cudf.api.types.dtype("int64"): str_cast.stol,
-    cudf.api.types.dtype("uint8"): str_cast.stoui8,
-    cudf.api.types.dtype("uint16"): str_cast.stoui16,
-    cudf.api.types.dtype("uint32"): str_cast.stoui,
-    cudf.api.types.dtype("uint64"): str_cast.stoul,
-    cudf.api.types.dtype("float32"): str_cast.stof,
-    cudf.api.types.dtype("float64"): str_cast.stod,
-    cudf.api.types.dtype("bool"): str_to_boolean,
-}
-
-_numeric_to_str_typecast_functions = {
-    cudf.api.types.dtype("int8"): str_cast.i8tos,
-    cudf.api.types.dtype("int16"): str_cast.i16tos,
-    cudf.api.types.dtype("int32"): str_cast.itos,
-    cudf.api.types.dtype("int64"): str_cast.ltos,
-    cudf.api.types.dtype("uint8"): str_cast.ui8tos,
-    cudf.api.types.dtype("uint16"): str_cast.ui16tos,
-    cudf.api.types.dtype("uint32"): str_cast.uitos,
-    cudf.api.types.dtype("uint64"): str_cast.ultos,
-    cudf.api.types.dtype("float32"): str_cast.ftos,
-    cudf.api.types.dtype("float64"): str_cast.dtos,
-    cudf.api.types.dtype("bool"): str_cast.from_booleans,
-}
-
-_datetime_to_str_typecast_functions = {
-    # TODO: support Date32 UNIX days
-    # cudf.api.types.dtype("datetime64[D]"): str_cast.int2timestamp,
-    cudf.api.types.dtype("datetime64[s]"): str_cast.int2timestamp,
-    cudf.api.types.dtype("datetime64[ms]"): str_cast.int2timestamp,
-    cudf.api.types.dtype("datetime64[us]"): str_cast.int2timestamp,
-    cudf.api.types.dtype("datetime64[ns]"): str_cast.int2timestamp,
-}
-
-_timedelta_to_str_typecast_functions = {
-    cudf.api.types.dtype("timedelta64[s]"): str_cast.int2timedelta,
-    cudf.api.types.dtype("timedelta64[ms]"): str_cast.int2timedelta,
-    cudf.api.types.dtype("timedelta64[us]"): str_cast.int2timedelta,
-    cudf.api.types.dtype("timedelta64[ns]"): str_cast.int2timedelta,
-}
-
-
-def _is_supported_regex_flags(flags):
+def _is_supported_regex_flags(flags: int) -> bool:
     return flags == 0 or (
         (flags & (re.MULTILINE | re.DOTALL) != 0)
         and (flags & ~(re.MULTILINE | re.DOTALL) == 0)
@@ -152,10 +99,7 @@ def htoi(self) -> SeriesOrIndex:
         3       51966
         dtype: int64
         """
-
-        out = str_cast.htoi(self._column)
-
-        return self._return_or_inplace(out, inplace=False)
+        return self._return_or_inplace(self._column.hex_to_integers())
 
     hex_to_int = htoi
 
@@ -185,10 +129,7 @@ def ip2int(self) -> SeriesOrIndex:
         2            0
         dtype: int64
         """
-
-        out = str_cast.ip2int(self._column)
-
-        return self._return_or_inplace(out, inplace=False)
+        return self._return_or_inplace(self._column.ipv4_to_integers())
 
     ip_to_int = ip2int
 
@@ -361,8 +302,10 @@ def cat(self, others=None, sep=None, na_rep=None):
             with acquire_spill_lock():
                 plc_column = plc.strings.combine.join_strings(
                     self._column.to_pylibcudf(mode="read"),
-                    cudf.Scalar(sep).device_value.c_value,
-                    cudf.Scalar(na_rep, "str").device_value.c_value,
+                    plc.interop.from_arrow(pa.scalar(sep)),
+                    plc.interop.from_arrow(
+                        pa.scalar(na_rep, type=pa.string())
+                    ),
                 )
                 data = Column.from_pylibcudf(plc_column)
         else:
@@ -418,8 +361,10 @@ def cat(self, others=None, sep=None, na_rep=None):
                             )
                         ]
                     ),
-                    cudf.Scalar(sep).device_value.c_value,
-                    cudf.Scalar(na_rep, "str").device_value.c_value,
+                    plc.interop.from_arrow(pa.scalar(sep)),
+                    plc.interop.from_arrow(
+                        pa.scalar(na_rep, type=pa.string())
+                    ),
                 )
                 data = Column.from_pylibcudf(plc_column)
 
@@ -581,11 +526,9 @@ def join(
             with acquire_spill_lock():
                 plc_column = plc.strings.combine.join_list_elements(
                     strings_column.to_pylibcudf(mode="read"),
-                    cudf.Scalar(sep).device_value.c_value,
-                    cudf.Scalar(string_na_rep).device_value.c_value,
-                    cudf._lib.scalar.DeviceScalar(
-                        "", cudf.dtype("object")
-                    ).c_value,
+                    plc.interop.from_arrow(pa.scalar(sep)),
+                    plc.interop.from_arrow(pa.scalar(string_na_rep)),
+                    plc.interop.from_arrow(pa.scalar("")),
                     plc.strings.combine.SeparatorOnNulls.YES,
                     plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
                 )
@@ -606,8 +549,8 @@ def join(
                 plc_column = plc.strings.combine.join_list_elements(
                     strings_column.to_pylibcudf(mode="read"),
                     sep_column.to_pylibcudf(mode="read"),
-                    cudf.Scalar(sep_na_rep).device_value.c_value,
-                    cudf.Scalar(string_na_rep).device_value.c_value,
+                    plc.interop.from_arrow(pa.scalar(sep_na_rep)),
+                    plc.interop.from_arrow(pa.scalar(string_na_rep)),
                     plc.strings.combine.SeparatorOnNulls.YES,
                     plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT,
                 )
@@ -622,7 +565,7 @@ def join(
 
     def _split_by_character(self):
         col = self._column.fillna("")  # sanitize nulls
-        result_col = libstrings.character_tokenize(col)
+        result_col = col.character_tokenize()
 
         offset_col = col.children[0]
 
@@ -859,14 +802,14 @@ def contains(
             else:
                 if case is False:
                     input_column = self.lower()._column  # type: ignore[union-attr]
-                    plc_pat = cudf.Scalar(pat.lower(), dtype="str")  # type: ignore[union-attr]
+                    pat_normed = pat.lower()  # type: ignore[union-attr]
                 else:
                     input_column = self._column
-                    plc_pat = cudf.Scalar(pat, dtype="str")
+                    pat_normed = pat
                 with acquire_spill_lock():
                     plc_result = plc.strings.find.contains(
                         input_column.to_pylibcudf(mode="read"),
-                        plc_pat.device_value.c_value,
+                        plc.interop.from_arrow(pa.scalar(pat_normed)),
                     )
                     result_col = Column.from_pylibcudf(plc_result)
         else:
@@ -951,8 +894,8 @@ def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex:
         with acquire_spill_lock():
             plc_result = plc.strings.contains.like(
                 self._column.to_pylibcudf(mode="read"),
-                cudf.Scalar(pat, "str").device_value.c_value,
-                cudf.Scalar(esc, "str").device_value.c_value,
+                plc.interop.from_arrow(pa.scalar(pat)),
+                plc.interop.from_arrow(pa.scalar(esc)),
             )
             result = Column.from_pylibcudf(plc_result)
 
@@ -1130,14 +1073,14 @@ def replace(
                     plc.strings.regex_program.RegexProgram.create(
                         pat, plc.strings.regex_flags.RegexFlags.DEFAULT
                     ),
-                    cudf.Scalar(repl, "str").device_value.c_value,
+                    plc.interop.from_arrow(pa.scalar(repl)),
                     n,
                 )
             else:
                 plc_result = plc.strings.replace.replace(
                     self._column.to_pylibcudf(mode="read"),
-                    cudf.Scalar(pat).device_value.c_value,
-                    cudf.Scalar(repl).device_value.c_value,
+                    plc.interop.from_arrow(pa.scalar(pat)),
+                    plc.interop.from_arrow(pa.scalar(repl)),
                     n,
                 )
             result = Column.from_pylibcudf(plc_result)
@@ -1253,13 +1196,13 @@ def slice(
         2    cm
         dtype: object
         """
-        param_dtype = np.dtype(np.int32)
+        param_dtype = pa.int32()
         with acquire_spill_lock():
             plc_result = plc.strings.slice.slice_strings(
                 self._column.to_pylibcudf(mode="read"),
-                cudf.Scalar(start, param_dtype).device_value.c_value,
-                cudf.Scalar(stop, param_dtype).device_value.c_value,
-                cudf.Scalar(step, param_dtype).device_value.c_value,
+                plc.interop.from_arrow(pa.scalar(start, param_dtype)),
+                plc.interop.from_arrow(pa.scalar(stop, param_dtype)),
+                plc.interop.from_arrow(pa.scalar(step, param_dtype)),
             )
             result = Column.from_pylibcudf(plc_result)
         return self._return_or_inplace(result)
@@ -1336,7 +1279,7 @@ def isinteger(self) -> SeriesOrIndex:
         2    False
         dtype: bool
         """
-        return self._return_or_inplace(libstrings.is_integer(self._column))
+        return self._return_or_inplace(self._column.is_integer())
 
     def ishex(self) -> SeriesOrIndex:
         """
@@ -1377,7 +1320,7 @@ def ishex(self) -> SeriesOrIndex:
         4     True
         dtype: bool
         """
-        return self._return_or_inplace(str_cast.is_hex(self._column))
+        return self._return_or_inplace(self._column.is_hex())
 
     def istimestamp(self, format: str) -> SeriesOrIndex:
         """
@@ -1401,9 +1344,7 @@ def istimestamp(self, format: str) -> SeriesOrIndex:
         3    False
         dtype: bool
         """
-        return self._return_or_inplace(
-            str_cast.istimestamp(self._column, format)
-        )
+        return self._return_or_inplace(self._column.is_timestamp(format))
 
     def isfloat(self) -> SeriesOrIndex:
         r"""
@@ -1468,7 +1409,7 @@ def isfloat(self) -> SeriesOrIndex:
         3    False
         dtype: bool
         """
-        return self._return_or_inplace(libstrings.is_float(self._column))
+        return self._return_or_inplace(self._column.is_float())
 
     def isdecimal(self) -> SeriesOrIndex:
         """
@@ -1954,7 +1895,7 @@ def isipv4(self) -> SeriesOrIndex:
         3    False
         dtype: bool
         """
-        return self._return_or_inplace(str_cast.is_ipv4(self._column))
+        return self._return_or_inplace(self._column.is_ipv4())
 
     def lower(self) -> SeriesOrIndex:
         """
@@ -2235,7 +2176,7 @@ def filter_alphanum(
                 plc.strings.char_types.StringCharacterTypes.ALL_TYPES
                 if keep
                 else plc.strings.char_types.StringCharacterTypes.ALPHANUM,
-                cudf.Scalar(repl, "str").device_value.c_value,
+                plc.interop.from_arrow(pa.scalar(repl, type=pa.string())),
                 plc.strings.char_types.StringCharacterTypes.ALPHANUM
                 if keep
                 else plc.strings.char_types.StringCharacterTypes.ALL_TYPES,
@@ -2379,7 +2320,7 @@ def slice_replace(
         with acquire_spill_lock():
             plc_result = plc.strings.replace.replace_slice(
                 self._column.to_pylibcudf(mode="read"),
-                cudf.Scalar(repl, "str").device_value.c_value,
+                plc.interop.from_arrow(pa.scalar(repl, type=pa.string())),
                 start,
                 stop,
             )
@@ -2560,7 +2501,7 @@ def get_json_object(
         with acquire_spill_lock():
             plc_result = plc.json.get_json_object(
                 self._column.to_pylibcudf(mode="read"),
-                cudf.Scalar(json_path, "str").device_value.c_value,
+                plc.interop.from_arrow(pa.scalar(json_path)),
                 options,
             )
             result = Column.from_pylibcudf(plc_result)
@@ -2710,15 +2651,19 @@ def split(
         if len(str(pat)) <= 1:
             regex = False
 
+        result_table: StringColumn | dict[int, StringColumn]
         if expand:
             if self._column.null_count == len(self._column):
                 result_table = {0: self._column.copy()}
             else:
                 if regex is True:
-                    data = libstrings.split_re(self._column, pat, n)
+                    data = self._column.split_re(pat, n)
                 else:
-                    data = libstrings.split(
-                        self._column, cudf.Scalar(pat, "str"), n
+                    data = self._column.split(
+                        plc.interop.from_arrow(
+                            pa.scalar(pat, type=pa.string())
+                        ),
+                        n,
                     )
                 if len(data) == 1 and data[0].null_count == len(self._column):
                     result_table = {}
@@ -2726,10 +2671,10 @@ def split(
                     result_table = data
         else:
             if regex is True:
-                result_table = libstrings.split_record_re(self._column, pat, n)
+                result_table = self._column.split_record_re(pat, n)
             else:
-                result_table = libstrings.split_record(
-                    self._column, cudf.Scalar(pat, "str"), n
+                result_table = self._column.split_record(
+                    plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n
                 )
 
         return self._return_or_inplace(result_table, expand=expand)
@@ -2883,15 +2828,19 @@ def rsplit(
         if regex and isinstance(pat, re.Pattern):
             pat = pat.pattern
 
+        result_table: StringColumn | dict[int, StringColumn]
         if expand:
             if self._column.null_count == len(self._column):
                 result_table = {0: self._column.copy()}
             else:
                 if regex is True:
-                    data = libstrings.rsplit_re(self._column, pat, n)
+                    data = self._column.rsplit_re(pat, n)
                 else:
-                    data = libstrings.rsplit(
-                        self._column, cudf.Scalar(pat, "str"), n
+                    data = self._column.rsplit(
+                        plc.interop.from_arrow(
+                            pa.scalar(pat, type=pa.string())
+                        ),
+                        n,
                     )
                 if len(data) == 1 and data[0].null_count == len(self._column):
                     result_table = {}
@@ -2899,12 +2848,10 @@ def rsplit(
                     result_table = data
         else:
             if regex is True:
-                result_table = libstrings.rsplit_record_re(
-                    self._column, pat, n
-                )
+                result_table = self._column.rsplit_record_re(pat, n)
             else:
-                result_table = libstrings.rsplit_record(
-                    self._column, cudf.Scalar(pat, "str"), n
+                result_table = self._column.rsplit_record(
+                    plc.interop.from_arrow(pa.scalar(pat, type=pa.string())), n
                 )
 
         return self._return_or_inplace(result_table, expand=expand)
@@ -2989,7 +2936,9 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
             sep = " "
 
         return self._return_or_inplace(
-            libstrings.partition(self._column, cudf.Scalar(sep, "str")),
+            self._column.partition(
+                plc.interop.from_arrow(pa.scalar(sep, type=pa.string()))
+            ),
             expand=expand,
         )
 
@@ -3054,7 +3003,9 @@ def rpartition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
             sep = " "
 
         return self._return_or_inplace(
-            libstrings.rpartition(self._column, cudf.Scalar(sep, "str")),
+            self._column.rpartition(
+                plc.interop.from_arrow(pa.scalar(sep, type=pa.string()))
+            ),
             expand=expand,
         )
 
@@ -3368,7 +3319,7 @@ def _strip(
             plc_result = plc.strings.strip.strip(
                 self._column.to_pylibcudf(mode="read"),
                 side,
-                cudf.Scalar(to_strip, "str").device_value.c_value,
+                plc.interop.from_arrow(pa.scalar(to_strip, type=pa.string())),
             )
             result = Column.from_pylibcudf(plc_result)
         return self._return_or_inplace(result)
@@ -3985,7 +3936,7 @@ def _starts_ends_with(
                 f"{type(pat).__name__}"
             )
         elif is_scalar(pat):
-            plc_pat = cudf.Scalar(pat, "str").device_value.c_value
+            plc_pat = plc.interop.from_arrow(pa.scalar(pat, type=pa.string()))
         else:
             plc_pat = column.as_column(pat, dtype="str").to_pylibcudf(
                 mode="read"
@@ -4126,9 +4077,7 @@ def removesuffix(self, suffix: str) -> SeriesOrIndex:
         ends_column = self.endswith(suffix)._column  # type: ignore[union-attr]
         removed_column = self.slice(0, -len(suffix), None)._column  # type: ignore[union-attr]
 
-        result = cudf._lib.copying.copy_if_else(
-            removed_column, self._column, ends_column
-        )
+        result = removed_column.copy_if_else(self._column, ends_column)
         return self._return_or_inplace(result)
 
     def removeprefix(self, prefix: str) -> SeriesOrIndex:
@@ -4166,9 +4115,7 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex:
             return self._return_or_inplace(self._column)
         starts_column = self.startswith(prefix)._column  # type: ignore[union-attr]
         removed_column = self.slice(len(prefix), None, None)._column  # type: ignore[union-attr]
-        result = cudf._lib.copying.copy_if_else(
-            removed_column, self._column, starts_column
-        )
+        result = removed_column.copy_if_else(self._column, starts_column)
         return self._return_or_inplace(result)
 
     def _find(
@@ -4189,7 +4136,7 @@ def _find(
         with acquire_spill_lock():
             plc_result = method(
                 self._column.to_pylibcudf(mode="read"),
-                cudf.Scalar(sub, "str").device_value.c_value,
+                plc.interop.from_arrow(pa.scalar(sub, type=pa.string())),
                 start,
                 end,
             )
@@ -4499,8 +4446,7 @@ def url_decode(self) -> SeriesOrIndex:
         1    https://medium.com/rapids-ai
         dtype: object
         """
-
-        return self._return_or_inplace(libstrings.url_decode(self._column))
+        return self._return_or_inplace(self._column.url_decode())
 
     def url_encode(self) -> SeriesOrIndex:
         """
@@ -4531,7 +4477,7 @@ def url_encode(self) -> SeriesOrIndex:
         1    https%3A%2F%2Fmedium.com%2Frapids-ai
         dtype: object
         """
-        return self._return_or_inplace(libstrings.url_encode(self._column))
+        return self._return_or_inplace(self._column.url_encode())
 
     def code_points(self) -> SeriesOrIndex:
         """
@@ -4673,7 +4619,7 @@ def filter_characters(
                 plc.strings.translate.FilterType.KEEP
                 if keep
                 else plc.strings.translate.FilterType.REMOVE,
-                cudf.Scalar(repl, "str").device_value.c_value,
+                plc.interop.from_arrow(pa.scalar(repl, type=pa.string())),
             )
             result = Column.from_pylibcudf(plc_result)
         return self._return_or_inplace(result)
@@ -4696,9 +4642,7 @@ def normalize_spaces(self) -> SeriesOrIndex:
         1    test string
         dtype: object
         """
-        return self._return_or_inplace(
-            libstrings.normalize_spaces(self._column)
-        )
+        return self._return_or_inplace(self._column.normalize_spaces())
 
     def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex:
         r"""
@@ -4746,7 +4690,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex:
         dtype: object
         """
         return self._return_or_inplace(
-            libstrings.normalize_characters(self._column, do_lower)
+            self._column.normalize_characters(do_lower)
         )
 
     def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
@@ -4778,16 +4722,16 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
         2    goodbye
         dtype: object
         """
-        delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
+        delim = _massage_string_arg(delimiter, "delimiter", allow_col=True)
 
-        if isinstance(delimiter, Column):
+        if isinstance(delim, Column):
             result = self._return_or_inplace(
-                libstrings._tokenize_column(self._column, delimiter),
+                self._column.tokenize_column(delim),  # type: ignore[arg-type]
                 retain_index=False,
             )
-        elif isinstance(delimiter, cudf.Scalar):
+        elif isinstance(delim, plc.Scalar):
             result = self._return_or_inplace(
-                libstrings._tokenize_scalar(self._column, delimiter),
+                self._column.tokenize_scalar(delim),
                 retain_index=False,
             )
         else:
@@ -4802,7 +4746,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex:
         return result
 
     def detokenize(
-        self, indices: "cudf.Series", separator: str = " "
+        self, indices: cudf.Series, separator: str = " "
     ) -> SeriesOrIndex:
         """
         Combines tokens into strings by concatenating them in the order
@@ -4832,9 +4776,9 @@ def detokenize(
         2          three
         dtype: object
         """
-        separator = _massage_string_arg(separator, "separator")
+        sep = _massage_string_arg(separator, "separator")
         return self._return_or_inplace(
-            libstrings.detokenize(self._column, indices._column, separator),
+            self._column.detokenize(indices._column, sep),  # type: ignore[arg-type]
             retain_index=False,
         )
 
@@ -4885,17 +4829,15 @@ def character_tokenize(self) -> SeriesOrIndex:
         2    .
         dtype: object
         """
-        result_col = libstrings.character_tokenize(self._column)
+        result_col = self._column.character_tokenize()
         if isinstance(self._parent, cudf.Series):
             lengths = self.len().fillna(0)
             index = self._parent.index.repeat(lengths)
-            return cudf.Series._from_column(
+            return type(self._parent)._from_column(
                 result_col, name=self._parent.name, index=index
             )
-        elif isinstance(self._parent, cudf.BaseIndex):
-            return cudf.Index._from_column(result_col, name=self._parent.name)
         else:
-            return result_col
+            return self._return_or_inplace(result_col)
 
     def token_count(self, delimiter: str = " ") -> SeriesOrIndex:
         """
@@ -4922,15 +4864,15 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex:
         2    0
         dtype: int32
         """
-        delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True)
-        if isinstance(delimiter, Column):
+        delim = _massage_string_arg(delimiter, "delimiter", allow_col=True)
+        if isinstance(delim, Column):
             return self._return_or_inplace(
-                libstrings._count_tokens_column(self._column, delimiter)
+                self._column.count_tokens_column(delim)  # type: ignore[arg-type]
             )
 
-        elif isinstance(delimiter, cudf.Scalar):
+        elif isinstance(delim, plc.Scalar):
             return self._return_or_inplace(
-                libstrings._count_tokens_scalar(self._column, delimiter)
+                self._column.count_tokens_scalar(delim)  # type: ignore[arg-type]
             )
         else:
             raise TypeError(
@@ -4969,9 +4911,9 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex:
         2    xyz_hhh
         dtype: object
         """
-        separator = _massage_string_arg(separator, "separator")
+        sep = _massage_string_arg(separator, "separator")
         return self._return_or_inplace(
-            libstrings.generate_ngrams(self._column, n, separator),
+            self._column.generate_ngrams(n, sep),  # type: ignore[arg-type]
             retain_index=False,
         )
 
@@ -5018,7 +4960,7 @@ def character_ngrams(
         dtype: list
         """
         result = self._return_or_inplace(
-            libstrings.generate_character_ngrams(self._column, n),
+            self._column.generate_character_ngrams(n),
             retain_index=True,
         )
         if isinstance(result, cudf.Series) and not as_list:
@@ -5063,7 +5005,7 @@ def hash_character_ngrams(
         """
 
         result = self._return_or_inplace(
-            libstrings.hash_character_ngrams(self._column, n),
+            self._column.hash_character_ngrams(n),
             retain_index=True,
         )
         if isinstance(result, cudf.Series) and not as_list:
@@ -5101,10 +5043,10 @@ def ngrams_tokenize(
         2    best_book
         dtype: object
         """
-        delimiter = _massage_string_arg(delimiter, "delimiter")
-        separator = _massage_string_arg(separator, "separator")
+        delim = _massage_string_arg(delimiter, "delimiter")
+        sep = _massage_string_arg(separator, "separator")
         return self._return_or_inplace(
-            libstrings.ngrams_tokenize(self._column, n, delimiter, separator),
+            self._column.ngrams_tokenize(n, delim, sep),  # type: ignore[arg-type]
             retain_index=False,
         )
 
@@ -5183,11 +5125,10 @@ def replace_tokens(
             )
 
         return self._return_or_inplace(
-            libstrings.replace_tokens(
-                self._column,
-                targets_column,
-                replacements_column,
-                cudf.Scalar(delimiter, dtype="str"),
+            self._column.replace_tokens(
+                targets_column,  # type: ignore[arg-type]
+                replacements_column,  # type: ignore[arg-type]
+                plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())),
             ),
         )
 
@@ -5254,11 +5195,12 @@ def filter_tokens(
             )
 
         return self._return_or_inplace(
-            libstrings.filter_tokens(
-                self._column,
+            self._column.filter_tokens(
                 min_token_length,
-                cudf.Scalar(replacement, dtype="str"),
-                cudf.Scalar(delimiter, dtype="str"),
+                plc.interop.from_arrow(
+                    pa.scalar(replacement, type=pa.string())
+                ),
+                plc.interop.from_arrow(pa.scalar(delimiter, type=pa.string())),
             ),
         )
 
@@ -5281,9 +5223,7 @@ def porter_stemmer_measure(self) -> SeriesOrIndex:
         1    2
         dtype: int32
         """
-        return self._return_or_inplace(
-            libstrings.porter_stemmer_measure(self._column)
-        )
+        return self._return_or_inplace(self._column.porter_stemmer_measure())
 
     def is_consonant(self, position) -> SeriesOrIndex:
         """
@@ -5316,17 +5256,10 @@ def is_consonant(self, position) -> SeriesOrIndex:
         1    False
         dtype: bool
         """
-        ltype = libstrings.LetterType.CONSONANT
-
         if can_convert_to_column(position):
-            return self._return_or_inplace(
-                libstrings.is_letter_multi(
-                    self._column, ltype, column.as_column(position)
-                ),
-            )
-
+            position = column.as_column(position)
         return self._return_or_inplace(
-            libstrings.is_letter(self._column, ltype, position)
+            self._column.is_letter(False, position)  # type: ignore[arg-type]
         )
 
     def is_vowel(self, position) -> SeriesOrIndex:
@@ -5360,17 +5293,10 @@ def is_vowel(self, position) -> SeriesOrIndex:
         1     True
         dtype: bool
         """
-        ltype = libstrings.LetterType.VOWEL
-
         if can_convert_to_column(position):
-            return self._return_or_inplace(
-                libstrings.is_letter_multi(
-                    self._column, ltype, column.as_column(position)
-                ),
-            )
-
+            position = column.as_column(position)
         return self._return_or_inplace(
-            libstrings.is_letter(self._column, ltype, position)
+            self._column.is_letter(True, position)  # type: ignore[arg-type]
         )
 
     def edit_distance(self, targets) -> SeriesOrIndex:
@@ -5419,7 +5345,7 @@ def edit_distance(self, targets) -> SeriesOrIndex:
             )
 
         return self._return_or_inplace(
-            libstrings.edit_distance(self._column, targets_column)
+            self._column.edit_distance(targets_column)  # type: ignore[arg-type]
         )
 
     def edit_distance_matrix(self) -> SeriesOrIndex:
@@ -5459,54 +5385,9 @@ def edit_distance_matrix(self) -> SeriesOrIndex:
                 "Cannot compute edit distance between null strings. "
                 "Consider removing them using `dropna` or fill with `fillna`."
             )
-        return self._return_or_inplace(
-            libstrings.edit_distance_matrix(self._column)
-        )
+        return self._return_or_inplace(self._column.edit_distance_matrix())
 
     def minhash(
-        self, seeds: ColumnLike | None = None, width: int = 4
-    ) -> SeriesOrIndex:
-        """
-        Compute the minhash of a strings column.
-        This uses the MurmurHash3_x86_32 algorithm for the hash function.
-
-        Parameters
-        ----------
-        seeds : ColumnLike
-            The seeds used for the hash algorithm.
-            Must be of type uint32.
-        width : int
-            The width of the substring to hash.
-            Default is 4 characters.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> str_series = cudf.Series(['this is my', 'favorite book'])
-        >>> seeds = cudf.Series([0], dtype=np.uint32)
-        >>> str_series.str.minhash(seeds)
-        0     [21141582]
-        1    [962346254]
-        dtype: list
-        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-        >>> str_series.str.minhash(seeds)
-        0    [21141582, 403093213, 1258052021]
-        1    [962346254, 677440381, 122618762]
-        dtype: list
-        """
-        if seeds is None:
-            seeds_column = column.as_column(0, dtype=np.uint32, length=1)
-        else:
-            seeds_column = column.as_column(seeds)
-            if seeds_column.dtype != np.uint32:
-                raise ValueError(
-                    f"Expecting a Series with dtype uint32, got {type(seeds)}"
-                )
-        return self._return_or_inplace(
-            libstrings.minhash(self._column, seeds_column, width)
-        )
-
-    def minhash_permuted(
         self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int
     ) -> SeriesOrIndex:
         """
@@ -5538,7 +5419,7 @@ def minhash_permuted(
         >>> s = cudf.Series(['this is my', 'favorite book'])
         >>> a = cudf.Series([1, 2, 3], dtype=np.uint32)
         >>> b = cudf.Series([4, 5, 6], dtype=np.uint32)
-        >>> s.str.minhash_permuted(0, a=a, b=b, width=5)
+        >>> s.str.minhash(0, a=a, b=b, width=5)
         0    [1305480171, 462824409, 74608232]
         1       [32665388, 65330773, 97996158]
         dtype: list
@@ -5554,53 +5435,10 @@ def minhash_permuted(
                 f"Expecting a Series with dtype uint32, got {type(b)}"
             )
         return self._return_or_inplace(
-            libstrings.minhash_permuted(
-                self._column, seed, a_column, b_column, width
-            )
+            self._column.minhash(seed, a_column, b_column, width)  # type: ignore[arg-type]
         )
 
     def minhash64(
-        self, seeds: ColumnLike | None = None, width: int = 4
-    ) -> SeriesOrIndex:
-        """
-        Compute the minhash of a strings column.
-
-        This uses the MurmurHash3_x64_128 algorithm for the hash function.
-        This function generates 2 uint64 values but only the first
-        uint64 value is used.
-
-        Parameters
-        ----------
-        seeds : ColumnLike
-            The seeds used for the hash algorithm.
-            Must be of type uint64.
-        width : int
-            The width of the substring to hash.
-            Default is 4 characters.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> str_series = cudf.Series(['this is my', 'favorite book'])
-        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
-        >>> str_series.str.minhash64(seeds)
-        0    [3232308021562742685, 4445611509348165860, 586435843695903598]
-        1    [23008204270530356, 1281229757012344693, 153762819128779913]
-        dtype: list
-        """
-        if seeds is None:
-            seeds_column = column.as_column(0, dtype=np.uint64, length=1)
-        else:
-            seeds_column = column.as_column(seeds)
-            if seeds_column.dtype != np.uint64:
-                raise ValueError(
-                    f"Expecting a Series with dtype uint64, got {type(seeds)}"
-                )
-        return self._return_or_inplace(
-            libstrings.minhash64(self._column, seeds_column, width)
-        )
-
-    def minhash64_permuted(
         self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int
     ) -> SeriesOrIndex:
         """
@@ -5631,7 +5469,7 @@ def minhash64_permuted(
         >>> s = cudf.Series(['this is my', 'favorite book', 'to read'])
         >>> a = cudf.Series([2, 3], dtype=np.uint64)
         >>> b = cudf.Series([5, 6], dtype=np.uint64)
-        >>> s.str.minhash64_permuted(0, a=a, b=b, width=5)
+        >>> s.str.minhash64(0, a=a, b=b, width=5)
         0    [172452388517576012, 316595762085180527]
         1      [71427536958126239, 58787297728258215]
         2    [423885828176437114, 1140588505926961370]
@@ -5648,79 +5486,7 @@ def minhash64_permuted(
                 f"Expecting a Series with dtype uint64, got {type(b)}"
             )
         return self._return_or_inplace(
-            libstrings.minhash64_permuted(
-                self._column, seed, a_column, b_column, width
-            )
-        )
-
-    def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
-        """
-        Compute the minhash of a list column of strings.
-        This uses the MurmurHash3_x86_32 algorithm for the hash function.
-
-        Parameters
-        ----------
-        seeds : ColumnLike
-            The seeds used for the hash algorithm.
-            Must be of type uint32.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> import numpy as np
-        >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
-        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-        >>> ls.str.word_minhash(seeds=seeds)
-        0     [21141582, 1232889953, 1268336794]
-        1    [962346254, 2321233602, 1354839212]
-        dtype: list
-        """
-        if seeds is None:
-            seeds_column = column.as_column(0, dtype=np.uint32, length=1)
-        else:
-            seeds_column = column.as_column(seeds)
-            if seeds_column.dtype != np.uint32:
-                raise ValueError(
-                    f"Expecting a Series with dtype uint32, got {type(seeds)}"
-                )
-        return self._return_or_inplace(
-            libstrings.word_minhash(self._column, seeds_column)
-        )
-
-    def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
-        """
-        Compute the minhash of a list column of strings.
-        This uses the MurmurHash3_x64_128 algorithm for the hash function.
-        This function generates 2 uint64 values but only the first
-        uint64 value is used.
-
-        Parameters
-        ----------
-        seeds : ColumnLike
-            The seeds used for the hash algorithm.
-            Must be of type uint64.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> import numpy as np
-        >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
-        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
-        >>> ls.str.word_minhash64(seeds)
-        0    [2603139454418834912, 8644371945174847701, 5541030711534384340]
-        1    [5240044617220523711, 5847101123925041457, 153762819128779913]
-        dtype: list
-        """
-        if seeds is None:
-            seeds_column = column.as_column(0, dtype=np.uint64, length=1)
-        else:
-            seeds_column = column.as_column(seeds)
-            if seeds_column.dtype != np.uint64:
-                raise ValueError(
-                    f"Expecting a Series with dtype uint64, got {type(seeds)}"
-                )
-        return self._return_or_inplace(
-            libstrings.word_minhash64(self._column, seeds_column)
+            self._column.minhash64(seed, a_column, b_column, width)  # type: ignore[arg-type]
         )
 
     def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex:
@@ -5746,26 +5512,27 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex:
         1    0.307692
         dtype: float32
         """
-
         return self._return_or_inplace(
-            libstrings.jaccard_index(self._column, input._column, width),
+            self._column.jaccard_index(input._column, width)
         )
 
 
-def _massage_string_arg(value, name, allow_col=False):
+def _massage_string_arg(
+    value, name, allow_col: bool = False
+) -> StringColumn | plc.Scalar:
     if isinstance(value, cudf.Scalar):
-        return value
+        return value.device_value.c_value  # unwrap: callers need plc.Scalar
 
     if isinstance(value, str):
-        return cudf.Scalar(value, dtype="str")
+        return plc.interop.from_arrow(pa.scalar(value, type=pa.string()))
 
     allowed_types = ["Scalar"]
 
     if allow_col:
         if isinstance(value, list):
-            return column.as_column(value, dtype="str")
+            return column.as_column(value, dtype="str")  # type: ignore[return-value]
 
-        if isinstance(value, Column) and is_string_dtype(value.dtype):
+        if isinstance(value, StringColumn):
             return value
 
         allowed_types.append("Column")
@@ -5998,8 +5765,8 @@ def sum(
             with acquire_spill_lock():
                 plc_column = plc.strings.combine.join_strings(
                     result_col.to_pylibcudf(mode="read"),
-                    cudf.Scalar("").device_value.c_value,
-                    cudf.Scalar(None, "str").device_value.c_value,
+                    plc.interop.from_arrow(pa.scalar("")),
+                    plc.interop.from_arrow(pa.scalar(None, type=pa.string())),
                 )
                 return Column.from_pylibcudf(plc_column).element_indexing(0)
         else:
@@ -6009,26 +5776,38 @@ def __contains__(self, item: ScalarLike) -> bool:
         other = [item] if is_scalar(item) else item
         return self.contains(column.as_column(other, dtype=self.dtype)).any()
 
-    def as_numerical_column(
-        self, dtype: Dtype
-    ) -> "cudf.core.column.NumericalColumn":
+    def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
         out_dtype = cudf.api.types.dtype(dtype)
-        string_col = self
-        if out_dtype.kind in {"i", "u"}:
-            if not libstrings.is_integer(string_col).all():
+        if out_dtype.kind == "b":
+            with acquire_spill_lock():
+                plc_column = plc.strings.attributes.count_characters(
+                    self.to_pylibcudf(mode="read")
+                )
+                result = Column.from_pylibcudf(plc_column)
+            return (result > np.int8(0)).fillna(False)
+        elif out_dtype.kind in {"i", "u"}:
+            if not self.is_integer().all():
                 raise ValueError(
                     "Could not convert strings to integer "
                     "type due to presence of non-integer values."
                 )
+            cast_func = plc.strings.convert.convert_integers.to_integers
         elif out_dtype.kind == "f":
-            if not libstrings.is_float(string_col).all():
+            if not self.is_float().all():
                 raise ValueError(
                     "Could not convert strings to float "
                     "type due to presence of non-floating values."
                 )
-
-        result_col = _str_to_numeric_typecast_functions[out_dtype](string_col)
-        return result_col
+            cast_func = plc.strings.convert.convert_floats.to_floats
+        else:
+            raise ValueError(
+                f"dtype must be a numerical type, not {out_dtype}"
+            )
+        plc_dtype = dtype_to_pylibcudf_type(out_dtype)
+        with acquire_spill_lock():
+            return type(self).from_pylibcudf(  # type: ignore[return-value]
+                cast_func(self.to_pylibcudf(mode="read"), plc_dtype)
+            )
 
     def strptime(
         self, dtype: Dtype, format: str
@@ -6038,7 +5817,7 @@ def strptime(
                 f"dtype must be datetime or timedelta type, not {dtype}"
             )
         elif self.null_count == len(self):
-            return column.column_empty(len(self), dtype=dtype, masked=True)  # type: ignore[return-value]
+            return column.column_empty(len(self), dtype=dtype)  # type: ignore[return-value]
         elif (self == "None").any():
             raise ValueError(
                 "Cannot convert `None` value to datetime or timedelta."
@@ -6063,23 +5842,27 @@ def strptime(
                 raise NotImplementedError(
                     "Cannot parse date-like strings with different formats"
                 )
-            valid_ts = str_cast.istimestamp(self, format)
+            valid_ts = self.is_timestamp(format)
             valid = valid_ts | is_nat
             if not valid.all():
                 raise ValueError(f"Column contains invalid data for {format=}")
 
-            casting_func = str_cast.timestamp2int
+            casting_func = plc.strings.convert.convert_datetime.to_timestamps
             add_back_nat = is_nat.any()
         elif dtype.kind == "m":  # type: ignore[union-attr]
-            casting_func = str_cast.timedelta2int
+            casting_func = plc.strings.convert.convert_durations.to_durations
             add_back_nat = False
 
-        result_col = casting_func(self, dtype, format)
+        with acquire_spill_lock():
+            plc_dtype = dtype_to_pylibcudf_type(dtype)
+            result_col = type(self).from_pylibcudf(
+                casting_func(self.to_pylibcudf(mode="read"), plc_dtype, format)
+            )
 
         if add_back_nat:
             result_col[is_nat] = None
 
-        return result_col
+        return result_col  # type: ignore[return-value]
 
     def as_datetime_column(
         self, dtype: Dtype
@@ -6099,10 +5882,17 @@ def as_timedelta_column(
     ) -> cudf.core.column.TimeDeltaColumn:
         return self.strptime(dtype, "%D days %H:%M:%S")  # type: ignore[return-value]
 
+    @acquire_spill_lock()
     def as_decimal_column(
         self, dtype: Dtype
-    ) -> "cudf.core.column.DecimalBaseColumn":
-        return libstrings.to_decimal(self, dtype)
+    ) -> cudf.core.column.DecimalBaseColumn:
+        plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point(
+            self.to_pylibcudf(mode="read"),
+            dtype_to_pylibcudf_type(dtype),
+        )
+        result = Column.from_pylibcudf(plc_column)
+        result.dtype.precision = dtype.precision  # type: ignore[union-attr]
+        return result  # type: ignore[return-value]
 
     def as_string_column(self) -> StringColumn:
         return self
@@ -6138,12 +5928,9 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:
 
         if self.dtype == to_dtype:
             return True
-        elif (
-            to_dtype.kind in {"i", "u"}
-            and not libstrings.is_integer(self).all()
-        ):
+        elif to_dtype.kind in {"i", "u"} and not self.is_integer().all():
             return False
-        elif to_dtype.kind == "f" and not libstrings.is_float(self).all():
+        elif to_dtype.kind == "f" and not self.is_float().all():
             return False
         else:
             return True
@@ -6264,8 +6051,10 @@ def _binaryop(
                                 rhs.to_pylibcudf(mode="read"),
                             ]
                         ),
-                        cudf.Scalar("").device_value.c_value,
-                        cudf.Scalar(None, "str").device_value.c_value,
+                        plc.interop.from_arrow(pa.scalar("")),
+                        plc.interop.from_arrow(
+                            pa.scalar(None, type=pa.string())
+                        ),
                     )
                     return Column.from_pylibcudf(plc_column)
             elif op in {
@@ -6305,15 +6094,287 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase":
 
         return to_view.view(dtype)
 
+    @acquire_spill_lock()
+    def minhash(
+        self,
+        seed: np.uint32,
+        a: NumericalColumn,
+        b: NumericalColumn,
+        width: int,
+    ) -> ListColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.minhash.minhash(
+                self.to_pylibcudf(mode="read"),
+                seed,
+                a.to_pylibcudf(mode="read"),
+                b.to_pylibcudf(mode="read"),
+                width,
+            )
+        )
+
+    @acquire_spill_lock()
+    def minhash64(
+        self,
+        seed: np.uint64,
+        a: NumericalColumn,
+        b: NumericalColumn,
+        width: int,
+    ) -> ListColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.minhash.minhash64(
+                self.to_pylibcudf(mode="read"),
+                seed,
+                a.to_pylibcudf(mode="read"),
+                b.to_pylibcudf(mode="read"),
+                width,
+            )
+        )
+
+    @acquire_spill_lock()
+    def jaccard_index(self, other: Self, width: int) -> NumericalColumn:
+        result = plc.nvtext.jaccard.jaccard_index(
+            self.to_pylibcudf(mode="read"),
+            other.to_pylibcudf(mode="read"),
+            width,
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def generate_ngrams(self, ngrams: int, separator: plc.Scalar) -> Self:
+        result = plc.nvtext.generate_ngrams.generate_ngrams(
+            self.to_pylibcudf(mode="read"),
+            ngrams,
+            separator,
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def generate_character_ngrams(self, ngrams: int) -> ListColumn:
+        result = plc.nvtext.generate_ngrams.generate_character_ngrams(
+            self.to_pylibcudf(mode="read"), ngrams
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def hash_character_ngrams(self, ngrams: int) -> ListColumn:
+        result = plc.nvtext.generate_ngrams.hash_character_ngrams(
+            self.to_pylibcudf(mode="read"), ngrams
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def edit_distance(self, targets: Self) -> NumericalColumn:
+        result = plc.nvtext.edit_distance.edit_distance(
+            self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def edit_distance_matrix(self) -> ListColumn:
+        result = plc.nvtext.edit_distance.edit_distance_matrix(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(result)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def byte_pair_encoding(
+        self,
+        merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs,
+        separator: str,
+    ) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.byte_pair_encode.byte_pair_encoding(
+                self.to_pylibcudf(mode="read"),
+                merge_pairs,
+                plc.interop.from_arrow(pa.scalar(separator)),
+            )
+        )
+
+    @acquire_spill_lock()
+    def ngrams_tokenize(
+        self,
+        ngrams: int,
+        delimiter: plc.Scalar,
+        separator: plc.Scalar,
+    ) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.ngrams_tokenize.ngrams_tokenize(
+                self.to_pylibcudf(mode="read"),
+                ngrams,
+                delimiter,
+                separator,
+            )
+        )
+
+    @acquire_spill_lock()
+    def normalize_spaces(self) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.normalize.normalize_spaces(
+                self.to_pylibcudf(mode="read")
+            )
+        )
+
+    @acquire_spill_lock()
+    def normalize_characters(self, do_lower: bool = True) -> Self:
+        return Column.from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.normalize.normalize_characters(
+                self.to_pylibcudf(mode="read"),
+                do_lower,
+            )
+        )
+
+    @acquire_spill_lock()
+    def replace_tokens(
+        self, targets: Self, replacements: Self, delimiter: plc.Scalar
+    ) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.replace.replace_tokens(
+                self.to_pylibcudf(mode="read"),
+                targets.to_pylibcudf(mode="read"),
+                replacements.to_pylibcudf(mode="read"),
+                delimiter,
+            )
+        )
+
+    @acquire_spill_lock()
+    def filter_tokens(
+        self,
+        min_token_length: int,
+        replacement: plc.Scalar,
+        delimiter: plc.Scalar,
+    ) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.replace.filter_tokens(
+                self.to_pylibcudf(mode="read"),
+                min_token_length,
+                replacement,
+                delimiter,
+            )
+        )
+
+    @acquire_spill_lock()
+    def porter_stemmer_measure(self) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.stemmer.porter_stemmer_measure(
+                self.to_pylibcudf(mode="read")
+            )
+        )
+
+    @acquire_spill_lock()
+    def is_letter(self, is_vowel: bool, index: int | NumericalColumn) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.stemmer.is_letter(
+                self.to_pylibcudf(mode="read"),
+                is_vowel,
+                index
+                if isinstance(index, int)
+                else index.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def subword_tokenize(
+        self,
+        hashed_vocabulary: plc.nvtext.subword_tokenize.HashedVocabulary,
+        max_sequence_length: int = 64,
+        stride: int = 48,
+        do_lower: bool = True,
+        do_truncate: bool = False,
+    ) -> tuple[ColumnBase, ColumnBase, ColumnBase]:
+        """
+        Subword tokenizes text series by using the pre-loaded hashed vocabulary
+        """
+        result = plc.nvtext.subword_tokenize.subword_tokenize(
+            self.to_pylibcudf(mode="read"),
+            hashed_vocabulary,
+            max_sequence_length,
+            stride,
+            do_lower,
+            do_truncate,
+        )
+        # return the 3 tensor components
+        tokens = type(self).from_pylibcudf(result[0])
+        masks = type(self).from_pylibcudf(result[1])
+        metadata = type(self).from_pylibcudf(result[2])
+        return tokens, masks, metadata
+
+    @acquire_spill_lock()
+    def tokenize_scalar(self, delimiter: plc.Scalar) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.tokenize_scalar(
+                self.to_pylibcudf(mode="read"), delimiter
+            )
+        )
+
+    @acquire_spill_lock()
+    def tokenize_column(self, delimiters: Self) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.tokenize_column(
+                self.to_pylibcudf(mode="read"),
+                delimiters.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def count_tokens_scalar(self, delimiter: plc.Scalar) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.count_tokens_scalar(
+                self.to_pylibcudf(mode="read"), delimiter
+            )
+        )
+
+    @acquire_spill_lock()
+    def count_tokens_column(self, delimiters: Self) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.count_tokens_column(
+                self.to_pylibcudf(mode="read"),
+                delimiters.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def character_tokenize(self) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.character_tokenize(
+                self.to_pylibcudf(mode="read")
+            )
+        )
+
+    @acquire_spill_lock()
+    def tokenize_with_vocabulary(
+        self,
+        vocabulary: plc.nvtext.tokenize.TokenizeVocabulary,
+        delimiter: str,
+        default_id: int,
+    ) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.tokenize_with_vocabulary(
+                self.to_pylibcudf(mode="read"),
+                vocabulary,
+                plc.interop.from_arrow(pa.scalar(delimiter)),
+                default_id,
+            )
+        )
+
+    @acquire_spill_lock()
+    def detokenize(self, indices: ColumnBase, separator: plc.Scalar) -> Self:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.nvtext.tokenize.detokenize(
+                self.to_pylibcudf(mode="read"),
+                indices.to_pylibcudf(mode="read"),
+                separator,
+            )
+        )
+
+    @acquire_spill_lock()
     def _modify_characters(
         self, method: Callable[[plc.Column], plc.Column]
     ) -> Self:
         """
         Helper function for methods that modify characters e.g. to_lower
         """
-        with acquire_spill_lock():
-            plc_column = method(self.to_pylibcudf(mode="read"))
-            return cast(Self, Column.from_pylibcudf(plc_column))
+        plc_column = method(self.to_pylibcudf(mode="read"))
+        return cast(Self, Column.from_pylibcudf(plc_column))
 
     def to_lower(self) -> Self:
         return self._modify_characters(plc.strings.case.to_lower)
@@ -6333,11 +6394,220 @@ def title(self) -> Self:
     def is_title(self) -> Self:
         return self._modify_characters(plc.strings.capitalize.is_title)
 
+    @acquire_spill_lock()
     def replace_multiple(self, pattern: Self, replacements: Self) -> Self:
-        with acquire_spill_lock():
-            plc_result = plc.strings.replace.replace_multiple(
+        plc_result = plc.strings.replace.replace_multiple(
+            self.to_pylibcudf(mode="read"),
+            pattern.to_pylibcudf(mode="read"),
+            replacements.to_pylibcudf(mode="read"),
+        )
+        return cast(Self, Column.from_pylibcudf(plc_result))
+
+    @acquire_spill_lock()
+    def is_hex(self) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.strings.convert.convert_integers.is_hex(
                 self.to_pylibcudf(mode="read"),
-                pattern.to_pylibcudf(mode="read"),
-                replacements.to_pylibcudf(mode="read"),
             )
-            return cast(Self, Column.from_pylibcudf(plc_result))
+        )
+
+    @acquire_spill_lock()
+    def hex_to_integers(self) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.strings.convert.convert_integers.hex_to_integers(
+                self.to_pylibcudf(mode="read"), plc.DataType(plc.TypeId.INT64)
+            )
+        )
+
+    @acquire_spill_lock()
+    def is_ipv4(self) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.strings.convert.convert_ipv4.is_ipv4(
+                self.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def ipv4_to_integers(self) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.strings.convert.convert_ipv4.ipv4_to_integers(
+                self.to_pylibcudf(mode="read"),
+            )
+        )
+
+    @acquire_spill_lock()
+    def is_timestamp(self, format: str) -> NumericalColumn:
+        return type(self).from_pylibcudf(  # type: ignore[return-value]
+            plc.strings.convert.convert_datetime.is_timestamp(
+                self.to_pylibcudf(mode="read"), format
+            )
+        )
+
+    @acquire_spill_lock()
+    def _split_record_re(
+        self,
+        pattern: str,
+        maxsplit: int,
+        method: Callable[
+            [plc.Column, plc.strings.regex_program.RegexProgram, int],
+            plc.Column,
+        ],
+    ) -> Self:
+        plc_column = method(
+            self.to_pylibcudf(mode="read"),
+            plc.strings.regex_program.RegexProgram.create(
+                pattern,
+                plc.strings.regex_flags.RegexFlags.DEFAULT,
+            ),
+            maxsplit,
+        )
+        return cast(Self, Column.from_pylibcudf(plc_column))
+
+    def split_record_re(self, pattern: str, maxsplit: int) -> Self:
+        return self._split_record_re(
+            pattern, maxsplit, plc.strings.split.split.split_record_re
+        )
+
+    def rsplit_record_re(self, pattern: str, maxsplit: int) -> Self:
+        return self._split_record_re(
+            pattern, maxsplit, plc.strings.split.split.rsplit_record_re
+        )
+
+    @acquire_spill_lock()
+    def _split_re(
+        self,
+        pattern: str,
+        maxsplit: int,
+        method: Callable[
+            [plc.Column, plc.strings.regex_program.RegexProgram, int],
+            plc.Table,
+        ],
+    ) -> dict[int, Self]:
+        plc_table = method(
+            self.to_pylibcudf(mode="read"),
+            plc.strings.regex_program.RegexProgram.create(
+                pattern,
+                plc.strings.regex_flags.RegexFlags.DEFAULT,
+            ),
+            maxsplit,
+        )
+        return dict(
+            enumerate(
+                Column.from_pylibcudf(col)  # type: ignore[misc]
+                for col in plc_table.columns()
+            )
+        )
+
+    def split_re(self, pattern: str, maxsplit: int) -> dict[int, Self]:
+        return self._split_re(
+            pattern, maxsplit, plc.strings.split.split.split_re
+        )
+
+    def rsplit_re(self, pattern: str, maxsplit: int) -> dict[int, Self]:
+        return self._split_re(
+            pattern, maxsplit, plc.strings.split.split.rsplit_re
+        )
+
+    @acquire_spill_lock()
+    def _split_record(
+        self,
+        delimiter: plc.Scalar,
+        maxsplit: int,
+        method: Callable[[plc.Column, plc.Scalar, int], plc.Column],
+    ) -> Self:
+        plc_column = method(
+            self.to_pylibcudf(mode="read"),
+            delimiter,
+            maxsplit,
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
+
+    def split_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self:
+        return self._split_record(
+            delimiter, maxsplit, plc.strings.split.split.split_record
+        )
+
+    def rsplit_record(self, delimiter: plc.Scalar, maxsplit: int) -> Self:
+        return self._split_record(
+            delimiter, maxsplit, plc.strings.split.split.rsplit_record
+        )
+
+    @acquire_spill_lock()
+    def _split(
+        self,
+        delimiter: plc.Scalar,
+        maxsplit: int,
+        method: Callable[[plc.Column, plc.Scalar, int], plc.Column],
+    ) -> dict[int, Self]:
+        plc_table = method(
+            self.to_pylibcudf(mode="read"),
+            delimiter,
+            maxsplit,
+        )
+        return dict(
+            enumerate(
+                Column.from_pylibcudf(col)  # type: ignore[misc]
+                for col in plc_table.columns()
+            )
+        )
+
+    def split(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]:
+        return self._split(delimiter, maxsplit, plc.strings.split.split.split)
+
+    def rsplit(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]:
+        return self._split(delimiter, maxsplit, plc.strings.split.split.rsplit)
+
+    @acquire_spill_lock()
+    def _partition(
+        self,
+        delimiter: plc.Scalar,
+        method: Callable[[plc.Column, plc.Scalar], plc.Column],
+    ) -> dict[int, Self]:
+        plc_table = method(
+            self.to_pylibcudf(mode="read"),
+            delimiter,
+        )
+        return dict(
+            enumerate(
+                Column.from_pylibcudf(col)  # type: ignore[misc]
+                for col in plc_table.columns()
+            )
+        )
+
+    def partition(self, delimiter: plc.Scalar) -> dict[int, Self]:
+        return self._partition(
+            delimiter, plc.strings.split.partition.partition
+        )
+
+    def rpartition(self, delimiter: plc.Scalar) -> dict[int, Self]:
+        return self._partition(
+            delimiter, plc.strings.split.partition.rpartition
+        )
+
+    @acquire_spill_lock()
+    def url_decode(self) -> Self:
+        plc_column = plc.strings.convert.convert_urls.url_decode(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def url_encode(self) -> Self:
+        plc_column = plc.strings.convert.convert_urls.url_encode(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def is_integer(self) -> NumericalColumn:
+        plc_column = plc.strings.convert.convert_integers.is_integer(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
+
+    @acquire_spill_lock()
+    def is_float(self) -> NumericalColumn:
+        plc_column = plc.strings.convert.convert_floats.is_float(
+            self.to_pylibcudf(mode="read")
+        )
+        return type(self).from_pylibcudf(plc_column)  # type: ignore[return-value]
diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index db6ad72ab56..052a68cec98 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from __future__ import annotations
 
 from functools import cached_property
@@ -18,6 +18,7 @@
 
     from cudf._typing import Dtype
     from cudf.core.buffer import Buffer
+    from cudf.core.column.string import StringColumn
 
 
 class StructColumn(ColumnBase):
@@ -51,6 +52,16 @@ def __init__(
             children=children,
         )
 
+    def _prep_pandas_compat_repr(self) -> StringColumn | Self:
+        """
+        Preprocess Column to be compatible with pandas repr, namely handling nulls.
+
+        * null (datetime/timedelta) = str(pd.NaT)
+        * null (other types) = str(pd.NA)
+        """
+        # TODO: handle the self.has_nulls() case; for now self is returned as-is
+        return self
+
     @staticmethod
     def _validate_dtype_instance(dtype: StructDtype) -> StructDtype:
         # IntervalDtype is a subclass of StructDtype, so compare types exactly
@@ -107,12 +118,9 @@ def memory_usage(self) -> int:
 
         return n
 
-    def element_indexing(self, index: int):
+    def element_indexing(self, index: int) -> dict:
         result = super().element_indexing(index)
-        return {
-            field: value
-            for field, value in zip(self.dtype.fields, result.values())
-        }
+        return dict(zip(self.dtype.fields, result.values()))
 
     def __setitem__(self, key, value):
         if isinstance(value, dict):
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index f3a7916aa35..302178ea277 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -1,18 +1,20 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
 import datetime
 import functools
+import math
 from typing import TYPE_CHECKING, cast
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
 
+import pylibcudf as plc
+
 import cudf
 import cudf.core.column.column as column
-import cudf.core.column.string as string
 from cudf.api.types import is_scalar
 from cudf.core._internals import binaryop, unary
 from cudf.core.buffer import Buffer, acquire_spill_lock
@@ -79,6 +81,8 @@ class TimeDeltaColumn(ColumnBase):
         "__rfloordiv__",
     }
 
+    _PANDAS_NA_REPR = str(pd.NaT)
+
     def __init__(
         self,
         data: Buffer,
@@ -262,7 +266,15 @@ def time_unit(self) -> str:
         return np.datetime_data(self.dtype)[0]
 
     def total_seconds(self) -> ColumnBase:
-        raise NotImplementedError("total_seconds is currently not implemented")
+        conversion = _unit_to_nanoseconds_conversion[self.time_unit] / 1e9
+        # Typecast to decimal128 to avoid floating point precision issues
+        # https://github.com/rapidsai/cudf/issues/17664
+        return (
+            (self.astype("int64") * conversion)
+            .astype(cudf.Decimal128Dtype(38, 9))
+            .round(decimals=abs(int(math.log10(conversion))))
+            .astype("float64")
+        )
 
     def ceil(self, freq: str) -> ColumnBase:
         raise NotImplementedError("ceil is currently not implemented")
@@ -294,12 +306,15 @@ def strftime(self, format: str) -> cudf.core.column.StringColumn:
         if len(self) == 0:
             return cast(
                 cudf.core.column.StringColumn,
-                column.column_empty(0, dtype="object", masked=False),
+                column.column_empty(0, dtype="object"),
             )
         else:
-            return string._timedelta_to_str_typecast_functions[self.dtype](
-                self, format=format
-            )
+            with acquire_spill_lock():
+                return type(self).from_pylibcudf(  # type: ignore[return-value]
+                    plc.strings.convert.convert_durations.from_durations(
+                        self.to_pylibcudf(mode="read"), format
+                    )
+                )
 
     def as_string_column(self) -> cudf.core.column.StringColumn:
         return self.strftime("%D days %H:%M:%S")
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index e4fd82e819b..aaf7d071dff 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -207,11 +207,16 @@ def _from_columns_like_self(
 
     @property
     def level_names(self) -> tuple[abc.Hashable, ...]:
+        if self.is_cached("to_pandas_index"):
+            return self.to_pandas_index.names
         if self._level_names is None or len(self._level_names) == 0:
             return tuple((None,) * max(1, self.nlevels))
         else:
             return self._level_names
 
+    def is_cached(self, attr_name: str) -> bool:
+        return attr_name in self.__dict__
+
     @property
     def nlevels(self) -> int:
         if len(self) == 0:
@@ -262,7 +267,12 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None:
         new_ncols: int
             len(self) after self._data was modified
         """
-        cached_properties = ("columns", "names", "_grouped_data")
+        cached_properties = (
+            "columns",
+            "names",
+            "_grouped_data",
+            "to_pandas_index",
+        )
         for attr in cached_properties:
             try:
                 self.__delattr__(attr)
@@ -276,6 +286,7 @@ def _clear_cache(self, old_ncols: int, new_ncols: int) -> None:
             except AttributeError:
                 pass
 
+    @cached_property
     def to_pandas_index(self) -> pd.Index:
         """Convert the keys of the ColumnAccessor to a Pandas Index object."""
         if self.multiindex and len(self.level_names) > 0:
@@ -726,10 +737,10 @@ def droplevel(self, level: int) -> None:
         }
         new_ncols = len(self)
         self._level_names = (
-            self._level_names[:level] + self._level_names[level + 1 :]
+            self.level_names[:level] + self.level_names[level + 1 :]
         )
 
-        if len(self._level_names) == 1:
+        if len(self.level_names) == 1:
             # can't use nlevels, as it depends on multiindex
             self.multiindex = False
         self._clear_cache(old_ncols, new_ncols)
diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py
index 16d8964f083..4b6ad59c8e1 100644
--- a/python/cudf/cudf/core/copy_types.py
+++ b/python/cudf/cudf/core/copy_types.py
@@ -5,7 +5,6 @@
 from typing_extensions import Self
 
 import cudf
-import cudf._lib as libcudf
 from cudf._lib.types import size_type_dtype
 
 if TYPE_CHECKING:
@@ -70,8 +69,8 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool):
             if self.column.dtype.kind not in {"i", "u"}:
                 raise TypeError("Gather map must have integer dtype")
             if not nullify:
-                lo, hi = libcudf.reduce.minmax(self.column)
-                if lo.value < -nrows or hi.value >= nrows:
+                lo, hi = self.column.minmax()
+                if lo < -nrows or hi >= nrows:
                     raise IndexError(
                         f"Gather map is out of bounds for [0, {nrows})"
                     )
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 656274bca38..5cea35ac0d6 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -7,7 +7,6 @@
 import itertools
 import numbers
 import os
-import pickle
 import re
 import sys
 import textwrap
@@ -50,7 +49,6 @@
 )
 from cudf.core import column, df_protocol, indexing_utils, reshape
 from cudf.core._compat import PANDAS_LT_300
-from cudf.core.abc import Serializable
 from cudf.core.buffer import acquire_spill_lock, as_buffer
 from cudf.core.column import (
     CategoricalColumn,
@@ -94,7 +92,11 @@
     min_signed_type,
 )
 from cudf.utils.performance_tracking import _performance_tracking
-from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api
+from cudf.utils.utils import (
+    GetAttrGetItemMixin,
+    _external_only_api,
+    _is_null_host_scalar,
+)
 
 if TYPE_CHECKING:
     from cudf._typing import ColumnLike, Dtype, NotImplementedType
@@ -588,7 +590,7 @@ class _DataFrameiAtIndexer(_DataFrameIlocIndexer):
     pass
 
 
-class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
+class DataFrame(IndexedFrame, GetAttrGetItemMixin):
     """
     A GPU Dataframe object.
 
@@ -776,9 +778,7 @@ def __init__(
                 label_dtype = getattr(columns, "dtype", None)
                 self._data = ColumnAccessor(
                     {
-                        k: column.column_empty(
-                            len(self), dtype="object", masked=True
-                        )
+                        k: column_empty(len(self), dtype="object")
                         for k in columns
                     },
                     level_names=tuple(columns.names)
@@ -965,7 +965,7 @@ def _init_from_series_list(self, data, columns, index):
                 warnings.simplefilter("ignore", FutureWarning)
                 concat_df = cudf.concat(data, axis=1)
 
-            cols = concat_df._data.to_pandas_index()
+            cols = concat_df._data.to_pandas_index
             if cols.dtype == "object":
                 concat_df.columns = cols.astype("str")
 
@@ -981,8 +981,8 @@ def _init_from_series_list(self, data, columns, index):
         if columns is not None:
             for col_name in columns:
                 if col_name not in self._data:
-                    self._data[col_name] = column.column_empty(
-                        row_count=len(self), dtype=None, masked=True
+                    self._data[col_name] = column_empty(
+                        row_count=len(self), dtype=None
                     )
             self._data._level_names = (
                 tuple(columns.names)
@@ -1033,11 +1033,7 @@ def _init_from_list_like(self, data, index=None, columns=None):
             data = list(itertools.zip_longest(*data))
 
             if columns is not None and len(data) == 0:
-                data = [
-                    cudf.core.column.column_empty(row_count=0, dtype=None)
-                    for _ in columns
-                ]
-
+                data = [column_empty(row_count=0, dtype=None) for _ in columns]
             for col_name, col in enumerate(data):
                 self._data[col_name] = column.as_column(col)
             self._data.rangeindex = True
@@ -1076,9 +1072,8 @@ def _init_from_dict_like(
                 # the provided index, so we need to return a masked
                 # array of nulls if an index is given.
                 empty_column = functools.partial(
-                    cudf.core.column.column_empty,
-                    row_count=(0 if index is None else len(index)),
-                    masked=index is not None,
+                    column_empty,
+                    row_count=0 if index is None else len(index),
                 )
 
             data = {
@@ -1190,7 +1185,7 @@ def _constructor_expanddim(self):
     def serialize(self):
         header, frames = super().serialize()
 
-        header["index"], index_frames = self.index.serialize()
+        header["index"], index_frames = self.index.device_serialize()
         header["index_frame_count"] = len(index_frames)
         # For backwards compatibility with older versions of cuDF, index
         # columns are placed before data columns.
@@ -1205,8 +1200,7 @@ def deserialize(cls, header, frames):
             header, frames[header["index_frame_count"] :]
         )
 
-        idx_typ = pickle.loads(header["index"]["type-serialized"])
-        index = idx_typ.deserialize(header["index"], frames[:index_nframes])
+        index = cls.device_deserialize(header["index"], frames[:index_nframes])
         obj.index = index
 
         return obj
@@ -1424,7 +1418,7 @@ def __setitem__(self, arg, value):
                         new_columns = (
                             value
                             if key == arg
-                            else column.column_empty(
+                            else column_empty(
                                 row_count=length, dtype=col.dtype
                             )
                             for key, col in self._column_labels_and_values
@@ -1812,13 +1806,37 @@ def _concat(
                 )
                 for table in tables
             ]
-
-            concatted = libcudf.utils.data_from_pylibcudf_table(
-                plc.concatenate.concatenate(plc_tables),
-                column_names=column_names,
-                index_names=index_names,
-            )
-        out = cls._from_data(*concatted)
+            plc_result = plc.concatenate.concatenate(plc_tables)
+            if ignore:
+                index = None
+                data = {
+                    col_name: ColumnBase.from_pylibcudf(col)
+                    for col_name, col in zip(
+                        column_names, plc_result.columns(), strict=True
+                    )
+                }
+            else:
+                result_columns = [
+                    ColumnBase.from_pylibcudf(col)
+                    for col in plc_result.columns()
+                ]
+                index = _index_from_data(
+                    dict(
+                        zip(
+                            index_names,
+                            result_columns[: len(index_names)],
+                            strict=True,
+                        )
+                    )
+                )
+                data = dict(
+                    zip(
+                        column_names,
+                        result_columns[len(index_names) :],
+                        strict=True,
+                    )
+                )
+        out = cls._from_data(data=data, index=index)
 
         # If ignore_index is True, all input frames are empty, and at
         # least one input frame has an index, assign a new RangeIndex
@@ -1876,7 +1894,7 @@ def astype(
             dtype = {cc: dtype for cc in self._column_names}
         return super().astype(dtype, copy, errors)
 
-    def _clean_renderable_dataframe(self, output):
+    def _clean_renderable_dataframe(self, output: Self) -> str:
         """
         This method takes in partial/preprocessed dataframe
         and returns correct representation of it with correct
@@ -1911,41 +1929,7 @@ def _clean_renderable_dataframe(self, output):
             )
         return "\n".join(lines)
 
-    def _clean_nulls_from_dataframe(self, df):
-        """
-        This function converts all ``null`` values to ``<NA>`` for
-        representation as a string in `__repr__`.
-
-        Since we utilize Pandas `__repr__` at all places in our code
-        for formatting purposes, we convert columns to `str` dtype for
-        filling with `<NA>` values.
-        """
-        for col in df._data:
-            if isinstance(
-                df._data[col].dtype, (cudf.StructDtype, cudf.ListDtype)
-            ):
-                # TODO we need to handle this
-                pass
-            elif df._data[col].has_nulls():
-                fill_value = (
-                    str(cudf.NaT)
-                    if isinstance(
-                        df._data[col],
-                        (
-                            cudf.core.column.DatetimeColumn,
-                            cudf.core.column.TimeDeltaColumn,
-                        ),
-                    )
-                    else str(cudf.NA)
-                )
-
-                df[col] = df._data[col].astype("str").fillna(fill_value)
-            else:
-                df[col] = df._data[col]
-
-        return df
-
-    def _get_renderable_dataframe(self):
+    def _get_renderable_dataframe(self) -> Self:
         """
         Takes rows and columns from pandas settings or estimation from size.
         pulls quadrants based off of some known parameters then style for
@@ -1953,9 +1937,9 @@ def _get_renderable_dataframe(self):
         for printing with the dataframe.
         """
         max_rows = pd.options.display.max_rows
-        nrows = np.max([len(self) if max_rows is None else max_rows, 1])
-        if pd.options.display.max_rows == 0:
-            nrows = len(self)
+        if max_rows in {0, None}:
+            max_rows = len(self)
+        nrows = max(max_rows, 1)
         ncols = (
             pd.options.display.max_columns
             if pd.options.display.max_columns
@@ -1963,7 +1947,7 @@ def _get_renderable_dataframe(self):
         )
 
         if len(self) <= nrows and self._num_columns <= ncols:
-            output = self.copy(deep=False)
+            output = self
         elif self.empty and len(self.index) > 0:
             max_seq_items = pd.options.display.max_seq_items
             # In case of Empty DataFrame with index, Pandas prints
@@ -2023,10 +2007,7 @@ def _get_renderable_dataframe(self):
                 lower = cudf.concat([lower_left, lower_right], axis=1)
                 output = cudf.concat([upper, lower])
 
-        output = self._clean_nulls_from_dataframe(output)
-        output.index = output.index._clean_nulls_from_index()
-
-        return output
+        return output._pandas_repr_compatible()
 
     @_performance_tracking
     def __repr__(self):
@@ -2078,7 +2059,7 @@ def _make_operands_and_index_for_binop(
             equal_columns = True
         elif isinstance(other, Series):
             if (
-                not (self_pd_columns := self._data.to_pandas_index()).equals(
+                not (self_pd_columns := self._data.to_pandas_index).equals(
                     other_pd_index := other.index.to_pandas()
                 )
                 and not can_reindex
@@ -2103,8 +2084,8 @@ def _make_operands_and_index_for_binop(
                 and fn in cudf.utils.utils._EQUALITY_OPS
                 and (
                     not self.index.equals(other.index)
-                    or not self._data.to_pandas_index().equals(
-                        other._data.to_pandas_index()
+                    or not self._data.to_pandas_index.equals(
+                        other._data.to_pandas_index
                     )
                 )
             ):
@@ -2148,11 +2129,11 @@ def _make_operands_and_index_for_binop(
 
         if not equal_columns:
             if isinstance(other, DataFrame):
-                column_names_list = self._data.to_pandas_index().join(
-                    other._data.to_pandas_index(), how="outer"
+                column_names_list = self._data.to_pandas_index.join(
+                    other._data.to_pandas_index, how="outer"
                 )
             elif isinstance(other, Series):
-                column_names_list = self._data.to_pandas_index().join(
+                column_names_list = self._data.to_pandas_index.join(
                     other.index.to_pandas(), how="outer"
                 )
             else:
@@ -2508,16 +2489,7 @@ def scatter_by_map(
                 )
 
             if map_index.size > 0:
-                plc_lo, plc_hi = plc.reduce.minmax(
-                    map_index.to_pylibcudf(mode="read")
-                )
-                # TODO: Use pylibcudf Scalar once APIs are more developed
-                lo = libcudf.column.Column.from_pylibcudf(
-                    plc.Column.from_scalar(plc_lo, 1)
-                ).element_indexing(0)
-                hi = libcudf.column.Column.from_pylibcudf(
-                    plc.Column.from_scalar(plc_hi, 1)
-                ).element_indexing(0)
+                lo, hi = map_index.minmax()
                 if lo < 0 or hi >= map_size:
                     raise ValueError("Partition map has invalid values")
 
@@ -2621,8 +2593,8 @@ def update(
         if not isinstance(other, DataFrame):
             other = DataFrame(other)
 
-        self_cols = self._data.to_pandas_index()
-        if not self_cols.equals(other._data.to_pandas_index()):
+        self_cols = self._data.to_pandas_index
+        if not self_cols.equals(other._data.to_pandas_index):
             other = other.reindex(self_cols, axis=1)
         if not self.index.equals(other.index):
             other = other.reindex(self.index, axis=0)
@@ -2658,7 +2630,7 @@ def __iter__(self):
     def __contains__(self, item):
         # This must check against containment in the pandas Index and not
         # self._column_names to handle NA, None, nan, etc. correctly.
-        return item in self._data.to_pandas_index()
+        return item in self._data.to_pandas_index
 
     @_performance_tracking
     def items(self):
@@ -2695,14 +2667,14 @@ def at(self):
 
     @property  # type: ignore
     @_external_only_api(
-        "Use _column_names instead, or _data.to_pandas_index() if a pandas "
+        "Use _column_names instead, or _data.to_pandas_index if a pandas "
         "index is absolutely necessary. For checking if the columns are a "
         "MultiIndex, use _data.multiindex."
     )
     @_performance_tracking
     def columns(self):
         """Returns a tuple of columns"""
-        return self._data.to_pandas_index()
+        return self._data.to_pandas_index
 
     @columns.setter  # type: ignore
     @_performance_tracking
@@ -2911,7 +2883,7 @@ def reindex(
             df = self
         else:
             columns = cudf.Index(columns)
-            intersection = self._data.to_pandas_index().intersection(
+            intersection = self._data.to_pandas_index.intersection(
                 columns.to_pandas()
             )
             df = self.loc[:, intersection]
@@ -3191,10 +3163,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None):
             )
 
             if cond_col := cond._data.get(name):
-                result = cudf._lib.copying.copy_if_else(
-                    source_col, other_col, cond_col
-                )
-
+                result = source_col.copy_if_else(other_col, cond_col)
                 out.append(result._with_type_metadata(col.dtype))
             else:
                 out_mask = as_buffer(
@@ -3369,7 +3338,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
             if isinstance(value, (np.ndarray, cupy.ndarray)):
                 dtype = value.dtype
                 value = value.item()
-            if libcudf.scalar._is_null_host_scalar(value):
+            if _is_null_host_scalar(value):
                 dtype = "str"
             value = as_column(
                 value,
@@ -3385,7 +3354,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
                 if num_cols != 0:
                     ca = self._data._from_columns_like_self(
                         (
-                            column.column_empty(row_count=length, dtype=dtype)
+                            column_empty(row_count=length, dtype=dtype)
                             for _, dtype in self._dtypes
                         ),
                         verify=False,
@@ -3428,7 +3397,7 @@ def axes(self):
             Index(['key', 'k2', 'val', 'temp'], dtype='object')]
 
         """
-        return [self.index, self._data.to_pandas_index()]
+        return [self.index, self._data.to_pandas_index]
 
     def diff(self, periods=1, axis=0):
         """
@@ -3491,7 +3460,7 @@ def diff(self, periods=1, axis=0):
         if abs(periods) > len(self):
             df = cudf.DataFrame._from_data(
                 {
-                    name: column_empty(len(self), dtype=dtype, masked=True)
+                    name: column_empty(len(self), dtype=dtype)
                     for name, dtype in zip(self._column_names, self.dtypes)
                 }
             )
@@ -3871,9 +3840,7 @@ def agg(self, aggs, axis=None):
                 result = DataFrame(index=idxs, columns=cols)
                 for key in aggs.keys():
                     col = self[key]
-                    col_empty = column_empty(
-                        len(idxs), dtype=col.dtype, masked=True
-                    )
+                    col_empty = column_empty(len(idxs), dtype=col.dtype)
                     ans = cudf.Series._from_column(
                         col_empty, index=cudf.Index(idxs)
                     )
@@ -4129,7 +4096,7 @@ def transpose(self):
             Not supporting *copy* because default and only behavior is
             copy=True
         """
-        index = self._data.to_pandas_index()
+        index = self._data.to_pandas_index
         columns = self.index.copy(deep=False)
         if self._num_columns == 0 or self._num_rows == 0:
             return DataFrame(index=index, columns=columns)
@@ -5535,7 +5502,7 @@ def to_pandas(
         }
 
         out_df = pd.DataFrame(out_data, index=out_index)
-        out_df.columns = self._data.to_pandas_index()
+        out_df.columns = self._data.to_pandas_index
 
         return out_df
 
@@ -6189,9 +6156,7 @@ def quantile(
                         quant_index=False,
                     )._column
                     if len(res) == 0:
-                        res = column.column_empty(
-                            row_count=len(qs), dtype=ser.dtype
-                        )
+                        res = column_empty(row_count=len(qs), dtype=ser.dtype)
                     result[k] = res
             result = DataFrame._from_data(result)
 
@@ -6264,10 +6229,8 @@ def isin(self, values):
         # TODO: propagate nulls through isin
         # https://github.com/rapidsai/cudf/issues/7556
 
-        fill_value = cudf.Scalar(False)
-
         def make_false_column_like_self():
-            return column.as_column(fill_value, length=len(self), dtype="bool")
+            return column.as_column(False, length=len(self), dtype="bool")
 
         # Preprocess different input types into a mapping from column names to
         # a list of values to check.
@@ -6489,7 +6452,7 @@ def _reduce(
             source = self._get_columns_by_label(numeric_cols)
             if source.empty:
                 return Series(
-                    index=self._data.to_pandas_index()[:0]
+                    index=self._data.to_pandas_index[:0]
                     if axis == 0
                     else source.index,
                     dtype="float64",
@@ -6542,7 +6505,7 @@ def _reduce(
                         "Columns must all have the same dtype to "
                         f"perform {op=} with {axis=}"
                     )
-                pd_index = source._data.to_pandas_index()
+                pd_index = source._data.to_pandas_index
                 if source._data.multiindex:
                     idx = MultiIndex.from_pandas(pd_index)
                 else:
@@ -6772,9 +6735,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
             )
             result = column.as_column(result, dtype=result_dtype)
             if mask is not None:
-                result = result.set_mask(
-                    cudf._lib.transform.bools_to_mask(mask._column)
-                )
+                result = result.set_mask(mask._column.as_mask())
             return Series._from_column(result, index=self.index)
         else:
             result_df = DataFrame(result, index=self.index)
@@ -7246,7 +7207,7 @@ def stack(
         ]
         has_unnamed_levels = len(unnamed_levels_indices) > 0
 
-        column_name_idx = self._data.to_pandas_index()
+        column_name_idx = self._data.to_pandas_index
         # Construct new index from the levels specified by `level`
         named_levels = pd.MultiIndex.from_arrays(
             [column_name_idx.get_level_values(lv) for lv in level_indices]
@@ -7347,9 +7308,7 @@ def unnamed_group_generator():
             )
 
             all_nulls = functools.cache(
-                functools.partial(
-                    column_empty, self.shape[0], common_type, masked=True
-                )
+                functools.partial(column_empty, self.shape[0], common_type)
             )
 
             # homogenize the dtypes of the columns
@@ -7438,7 +7397,7 @@ def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False):
             )
 
         cov = cupy.cov(self.values, ddof=ddof, rowvar=False)
-        cols = self._data.to_pandas_index()
+        cols = self._data.to_pandas_index
         df = DataFrame(cupy.asfortranarray(cov), index=cols)
         df._set_columns_like(self._data)
         return df
@@ -7481,7 +7440,7 @@ def corr(
             )
 
         corr = cupy.corrcoef(values, rowvar=False)
-        cols = self._data.to_pandas_index()
+        cols = self._data.to_pandas_index
         df = DataFrame(cupy.asfortranarray(corr), index=cols)
         df._set_columns_like(self._data)
         return df
@@ -7550,7 +7509,7 @@ def keys(self):
         >>> df.keys()
         Index([0, 1, 2, 3], dtype='int64')
         """
-        return self._data.to_pandas_index()
+        return self._data.to_pandas_index
 
     def itertuples(self, index=True, name="Pandas"):
         """
@@ -7784,7 +7743,7 @@ def nunique(self, axis=0, dropna: bool = True) -> Series:
             raise NotImplementedError("axis parameter is not supported yet.")
         counts = [col.distinct_count(dropna=dropna) for col in self._columns]
         return self._constructor_sliced(
-            counts, index=self._data.to_pandas_index()
+            counts, index=self._data.to_pandas_index
         )
 
     def _sample_axis_1(
@@ -7883,6 +7842,17 @@ def interleave_columns(self):
             )
         return self._constructor_sliced._from_column(result_col)
 
+    @acquire_spill_lock()
+    def _compute_column(self, expr: str) -> ColumnBase:
+        """Helper function for eval"""
+        plc_column = plc.transform.compute_column(
+            plc.Table(
+                [col.to_pylibcudf(mode="read") for col in self._columns]
+            ),
+            plc.expressions.to_expression(expr, self._column_names),
+        )
+        return libcudf.column.Column.from_pylibcudf(plc_column)
+
     @_performance_tracking
     def eval(self, expr: str, inplace: bool = False, **kwargs):
         """Evaluate a string describing operations on DataFrame columns.
@@ -8010,11 +7980,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
                 raise ValueError(
                     "Cannot operate inplace if there is no assignment"
                 )
-            return Series._from_column(
-                libcudf.transform.compute_column(
-                    [*self._columns], self._column_names, statements[0]
-                )
-            )
+            return Series._from_column(self._compute_column(statements[0]))
 
         targets = []
         exprs = []
@@ -8030,15 +7996,9 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
             targets.append(t.strip())
             exprs.append(e.strip())
 
-        cols = (
-            libcudf.transform.compute_column(
-                [*self._columns], self._column_names, e
-            )
-            for e in exprs
-        )
         ret = self if inplace else self.copy(deep=False)
-        for name, col in zip(targets, cols):
-            ret._data[name] = col
+        for name, expr in zip(targets, exprs):
+            ret._data[name] = self._compute_column(expr)
         if not inplace:
             return ret
 
@@ -8596,7 +8556,7 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories):
             # If column not in this df, fill with an all-null column
             if idx >= len(cols) or cols[idx] is None:
                 n = len(next(x for x in cols if x is not None))
-                cols[idx] = column_empty(row_count=n, dtype=dtype, masked=True)
+                cols[idx] = column_empty(row_count=n, dtype=dtype)
             else:
                 # If column is categorical, rebase the codes with the
                 # combined categories, and cast the new codes to the
diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index aa601a2b322..a798041699e 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -799,8 +799,7 @@ def _set_missing_values(
             valid_mask = _ensure_gpu_buffer(
                 valid_mask[0], valid_mask[1], allow_copy
             )
-            boolmask = as_column(valid_mask._buf, dtype="bool")
-            bitmask = cudf._lib.transform.bools_to_mask(boolmask)
+            bitmask = as_column(valid_mask._buf, dtype="bool").as_mask()
             return cudf_col.set_mask(bitmask)
         elif null == _MaskKind.BITMASK:
             valid_mask = _ensure_gpu_buffer(
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 801020664da..8ed233ba737 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -3,7 +3,6 @@
 
 import decimal
 import operator
-import pickle
 import textwrap
 import warnings
 from functools import cached_property
@@ -57,7 +56,9 @@ def dtype(arbitrary):
     else:
         if np_dtype.kind in set("OU"):
             return np.dtype("object")
-        elif np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
+        elif (
+            np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES
+        ):
             raise TypeError(f"Unsupported type {np_dtype}")
         return np_dtype
 
@@ -91,13 +92,13 @@ def dtype(arbitrary):
         raise TypeError(f"Cannot interpret {arbitrary} as a valid cuDF dtype")
 
 
-def _decode_type(
+def _check_type(
     cls: type,
     header: dict,
     frames: list,
     is_valid_class: Callable[[type, type], bool] = operator.is_,
-) -> tuple[dict, list, type]:
-    """Decode metadata-encoded type and check validity
+) -> None:
+    """Check the metadata-encoded type for validity
 
     Parameters
     ----------
@@ -112,12 +113,6 @@ class performing deserialization
         serialization by `cls` (default is to check type equality), called
         as `is_valid_class(decoded_class, cls)`.
 
-    Returns
-    -------
-    tuple
-        Tuple of validated headers, frames, and the decoded class
-        constructor.
-
     Raises
     ------
     AssertionError
@@ -128,11 +123,11 @@ class performing deserialization
         f"Deserialization expected {header['frame_count']} frames, "
         f"but received {len(frames)}."
     )
-    klass = pickle.loads(header["type-serialized"])
+    klass = Serializable._name_type_map[header["type-serialized-name"]]
     assert is_valid_class(
-        klass, cls
+        klass,
+        cls,
     ), f"Header-encoded {klass=} does not match decoding {cls=}."
-    return header, frames, klass
 
 
 class _BaseDtype(ExtensionDtype, Serializable):
@@ -196,9 +191,7 @@ def categories(self) -> cudf.Index:
         Index(['b', 'a'], dtype='object')
         """
         if self._categories is None:
-            col = cudf.core.column.column_empty(
-                0, dtype="object", masked=False
-            )
+            col = cudf.core.column.column_empty(0, dtype="object")
         else:
             col = self._categories
         return cudf.Index._from_column(col)
@@ -305,13 +298,14 @@ def construct_from_string(self):
 
     def serialize(self):
         header = {}
-        header["type-serialized"] = pickle.dumps(type(self))
         header["ordered"] = self.ordered
 
         frames = []
 
         if self.categories is not None:
-            categories_header, categories_frames = self.categories.serialize()
+            categories_header, categories_frames = (
+                self.categories.device_serialize()
+            )
         header["categories"] = categories_header
         frames.extend(categories_frames)
         header["frame_count"] = len(frames)
@@ -319,15 +313,14 @@ def serialize(self):
 
     @classmethod
     def deserialize(cls, header, frames):
-        header, frames, klass = _decode_type(cls, header, frames)
+        _check_type(cls, header, frames)
         ordered = header["ordered"]
         categories_header = header["categories"]
         categories_frames = frames
-        categories_type = pickle.loads(categories_header["type-serialized"])
-        categories = categories_type.deserialize(
+        categories = Serializable.device_deserialize(
             categories_header, categories_frames
         )
-        return klass(categories=categories, ordered=ordered)
+        return cls(categories=categories, ordered=ordered)
 
     def __repr__(self):
         return self.to_pandas().__repr__()
@@ -495,12 +488,13 @@ def __hash__(self):
 
     def serialize(self) -> tuple[dict, list]:
         header: dict[str, Dtype] = {}
-        header["type-serialized"] = pickle.dumps(type(self))
 
         frames = []
 
         if isinstance(self.element_type, _BaseDtype):
-            header["element-type"], frames = self.element_type.serialize()
+            header["element-type"], frames = (
+                self.element_type.device_serialize()
+            )
         else:
             header["element-type"] = getattr(
                 self.element_type, "name", self.element_type
@@ -510,14 +504,14 @@ def serialize(self) -> tuple[dict, list]:
 
     @classmethod
     def deserialize(cls, header: dict, frames: list):
-        header, frames, klass = _decode_type(cls, header, frames)
+        _check_type(cls, header, frames)
         if isinstance(header["element-type"], dict):
-            element_type = pickle.loads(
-                header["element-type"]["type-serialized"]
-            ).deserialize(header["element-type"], frames)
+            element_type = Serializable.device_deserialize(
+                header["element-type"], frames
+            )
         else:
             element_type = header["element-type"]
-        return klass(element_type=element_type)
+        return cls(element_type=element_type)
 
     @cached_property
     def itemsize(self):
@@ -641,7 +635,6 @@ def __hash__(self):
 
     def serialize(self) -> tuple[dict, list]:
         header: dict[str, Any] = {}
-        header["type-serialized"] = pickle.dumps(type(self))
 
         frames: list[Buffer] = []
 
@@ -649,33 +642,31 @@ def serialize(self) -> tuple[dict, list]:
 
         for k, dtype in self.fields.items():
             if isinstance(dtype, _BaseDtype):
-                dtype_header, dtype_frames = dtype.serialize()
+                dtype_header, dtype_frames = dtype.device_serialize()
                 fields[k] = (
                     dtype_header,
                     (len(frames), len(frames) + len(dtype_frames)),
                 )
                 frames.extend(dtype_frames)
             else:
-                fields[k] = pickle.dumps(dtype)
+                fields[k] = dtype.str
         header["fields"] = fields
         header["frame_count"] = len(frames)
         return header, frames
 
     @classmethod
     def deserialize(cls, header: dict, frames: list):
-        header, frames, klass = _decode_type(cls, header, frames)
+        _check_type(cls, header, frames)
         fields = {}
         for k, dtype in header["fields"].items():
             if isinstance(dtype, tuple):
                 dtype_header, (start, stop) = dtype
-                fields[k] = pickle.loads(
-                    dtype_header["type-serialized"]
-                ).deserialize(
+                fields[k] = Serializable.device_deserialize(
                     dtype_header,
                     frames[start:stop],
                 )
             else:
-                fields[k] = pickle.loads(dtype)
+                fields[k] = np.dtype(dtype)
         return cls(fields)
 
     @cached_property
@@ -838,7 +829,6 @@ def _from_decimal(cls, decimal):
     def serialize(self) -> tuple[dict, list]:
         return (
             {
-                "type-serialized": pickle.dumps(type(self)),
                 "precision": self.precision,
                 "scale": self.scale,
                 "frame_count": 0,
@@ -848,11 +838,8 @@ def serialize(self) -> tuple[dict, list]:
 
     @classmethod
     def deserialize(cls, header: dict, frames: list):
-        header, frames, klass = _decode_type(
-            cls, header, frames, is_valid_class=issubclass
-        )
-        klass = pickle.loads(header["type-serialized"])
-        return klass(header["precision"], header["scale"])
+        _check_type(cls, header, frames, is_valid_class=issubclass)
+        return cls(header["precision"], header["scale"])
 
     def __eq__(self, other: Dtype) -> bool:
         if other is self:
@@ -960,18 +947,17 @@ def __hash__(self):
 
     def serialize(self) -> tuple[dict, list]:
         header = {
-            "type-serialized": pickle.dumps(type(self)),
-            "fields": pickle.dumps((self.subtype, self.closed)),
+            "fields": (self.subtype.str, self.closed),
             "frame_count": 0,
         }
         return header, []
 
     @classmethod
     def deserialize(cls, header: dict, frames: list):
-        header, frames, klass = _decode_type(cls, header, frames)
-        klass = pickle.loads(header["type-serialized"])
-        subtype, closed = pickle.loads(header["fields"])
-        return klass(subtype, closed=closed)
+        _check_type(cls, header, frames)
+        subtype, closed = header["fields"]
+        subtype = np.dtype(subtype)
+        return cls(subtype, closed=closed)
 
 
 def _is_categorical_dtype(obj):
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 0a7e6fefe6e..abf9f7b3686 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1,9 +1,8 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
 import operator
-import pickle
 import warnings
 from collections import abc
 from typing import TYPE_CHECKING, Any, Literal
@@ -23,7 +22,9 @@
 from cudf import _lib as libcudf
 from cudf.api.types import is_dtype_equal, is_scalar
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals import copying, sorting
 from cudf.core._internals.search import search_sorted
+from cudf.core.abc import Serializable
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
     ColumnBase,
@@ -47,7 +48,7 @@
 
 
 # TODO: It looks like Frame is missing a declaration of `copy`, need to add
-class Frame(BinaryOperand, Scannable):
+class Frame(BinaryOperand, Scannable, Serializable):
     """A collection of Column objects with an optional index.
 
     Parameters
@@ -97,37 +98,80 @@ def ndim(self) -> int:
     @_performance_tracking
     def serialize(self):
         # TODO: See if self._data can be serialized outright
+        frames = []
         header = {
-            "type-serialized": pickle.dumps(type(self)),
-            "column_names": pickle.dumps(self._column_names),
-            "column_rangeindex": pickle.dumps(self._data.rangeindex),
-            "column_multiindex": pickle.dumps(self._data.multiindex),
-            "column_label_dtype": pickle.dumps(self._data.label_dtype),
-            "column_level_names": pickle.dumps(self._data._level_names),
+            "column_label_dtype": None,
+            "dtype-is-cudf-serialized": False,
         }
-        header["columns"], frames = serialize_columns(self._columns)
+        if (label_dtype := self._data.label_dtype) is not None:
+            try:
+                header["column_label_dtype"], frames = (
+                    label_dtype.device_serialize()
+                )
+                header["dtype-is-cudf-serialized"] = True
+            except AttributeError:
+                header["column_label_dtype"] = label_dtype.str
+
+        header["columns"], column_frames = serialize_columns(self._columns)
+        column_names, column_names_numpy_type = (
+            zip(
+                *[
+                    (cname.item(), type(cname).__name__)
+                    if isinstance(cname, np.generic)
+                    else (cname, "")
+                    for cname in self._column_names
+                ]
+            )
+            if self._column_names
+            else ((), ())
+        )
+        header |= {
+            "column_names": column_names,
+            "column_names_numpy_type": column_names_numpy_type,
+            "column_rangeindex": self._data.rangeindex,
+            "column_multiindex": self._data.multiindex,
+            "column_level_names": self._data._level_names,
+        }
+        frames.extend(column_frames)
+
         return header, frames
 
     @classmethod
     @_performance_tracking
     def deserialize(cls, header, frames):
-        cls_deserialize = pickle.loads(header["type-serialized"])
-        column_names = pickle.loads(header["column_names"])
-        columns = deserialize_columns(header["columns"], frames)
         kwargs = {}
+        dtype_header = header["column_label_dtype"]
+        if header["dtype-is-cudf-serialized"]:
+            count = dtype_header["frame_count"]
+            kwargs["label_dtype"] = cls.device_deserialize(
+                dtype_header, frames[:count]
+            )
+            frames = frames[count:]  # remaining frames belong to the columns
+        else:
+            kwargs["label_dtype"] = (
+                np.dtype(dtype_header) if dtype_header is not None else None
+            )
+
+        columns = deserialize_columns(header["columns"], frames)
         for metadata in [
             "rangeindex",
             "multiindex",
-            "label_dtype",
             "level_names",
         ]:
             key = f"column_{metadata}"
             if key in header:
-                kwargs[metadata] = pickle.loads(header[key])
+                kwargs[metadata] = header[key]
+
+        column_names = [
+            getattr(np, cntype)(cname) if cntype != "" else cname
+            for cname, cntype in zip(
+                header["column_names"], header["column_names_numpy_type"]
+            )
+        ]
         col_accessor = ColumnAccessor(
             data=dict(zip(column_names, columns)), **kwargs
         )
-        return cls_deserialize._from_data(col_accessor)
+        return cls._from_data(col_accessor)
 
     @classmethod
     @_performance_tracking
@@ -776,6 +820,13 @@ def fillna(
             inplace=inplace,
         )
 
+    def _pandas_repr_compatible(self) -> Self:
+        """Rebuild this frame from columns readied for a pandas-style repr."""
+        prepared = (c._prep_pandas_compat_repr() for c in self._columns)
+        # Build the new accessor without verification (verify=False).
+        data = self._data._from_columns_like_self(prepared, verify=False)
+        return self._from_data_like_self(data)
+
     @_performance_tracking
     def _drop_column(
         self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise"
@@ -817,7 +868,9 @@ def _quantile_table(
                 column_order,
                 null_precedence,
             )
-            columns = libcudf.utils.columns_from_pylibcudf_table(plc_table)
+            columns = [
+                ColumnBase.from_pylibcudf(col) for col in plc_table.columns()
+            ]
         return self._from_columns_like_self(
             columns,
             column_names=self._column_names,
@@ -902,16 +955,17 @@ def from_arrow(cls, data: pa.Table) -> Self:
         if len(dict_indices):
             dict_indices_table = pa.table(dict_indices)
             data = data.drop(dict_indices_table.column_names)
-            indices_columns = libcudf.interop.from_arrow(dict_indices_table)
+            plc_indices = plc.interop.from_arrow(dict_indices_table)
             # as dictionary size can vary, it can't be a single table
             cudf_dictionaries_columns = {
                 name: ColumnBase.from_arrow(dict_dictionaries[name])
                 for name in dict_dictionaries.keys()
             }
 
-            for name, codes in zip(
-                dict_indices_table.column_names, indices_columns
+            for name, plc_codes in zip(
+                dict_indices_table.column_names, plc_indices.columns()
             ):
+                codes = libcudf.column.Column.from_pylibcudf(plc_codes)
                 categories = cudf_dictionaries_columns[name]
                 codes = as_unsigned_codes(len(categories), codes)
                 cudf_category_frame[name] = CategoricalColumn(
@@ -927,9 +981,9 @@ def from_arrow(cls, data: pa.Table) -> Self:
 
         # Handle non-dict arrays
         cudf_non_category_frame = {
-            name: col
-            for name, col in zip(
-                data.column_names, libcudf.interop.from_arrow(data)
+            name: libcudf.column.Column.from_pylibcudf(plc_col)
+            for name, plc_col in zip(
+                data.column_names, plc.interop.from_arrow(data).columns()
             )
         }
 
@@ -988,7 +1042,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
         return cls._from_data({name: result[name] for name in column_names})
 
     @_performance_tracking
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Table:
         """
         Convert to arrow Table
 
@@ -1014,19 +1068,6 @@ def to_arrow(self):
             }
         )
 
-    @_performance_tracking
-    def _positions_from_column_names(self, column_names) -> list[int]:
-        """Map each column name into their positions in the frame.
-
-        The order of indices returned corresponds to the column order in this
-        Frame.
-        """
-        return [
-            i
-            for i, name in enumerate(self._column_names)
-            if name in set(column_names)
-        ]
-
     @_performance_tracking
     def _copy_type_metadata(self: Self, other: Self) -> Self:
         """
@@ -1433,7 +1474,7 @@ def _get_sorted_inds(
         else:
             ascending_lst = list(ascending)
 
-        return libcudf.sort.order_by(
+        return sorting.order_by(
             list(to_sort),
             ascending_lst,
             na_position,
@@ -1441,23 +1482,25 @@ def _get_sorted_inds(
         )
 
     @_performance_tracking
-    def _split(self, splits):
+    def _split(self, splits: list[int]) -> list[Self]:
         """Split a frame with split points in ``splits``. Returns a list of
         Frames of length `len(splits) + 1`.
         """
         return [
-            self._from_columns_like_self(
-                libcudf.copying.columns_split(list(self._columns), splits)[
-                    split_idx
-                ],
-                self._column_names,
-            )
-            for split_idx in range(len(splits) + 1)
+            self._from_columns_like_self(split, self._column_names)
+            for split in copying.columns_split(self._columns, splits)
         ]
 
     @_performance_tracking
     def _encode(self):
-        columns, indices = libcudf.transform.table_encode(list(self._columns))
+        plc_table, plc_column = plc.transform.encode(
+            plc.Table([col.to_pylibcudf(mode="read") for col in self._columns])
+        )
+        columns = [
+            libcudf.column.Column.from_pylibcudf(col)
+            for col in plc_table.columns()
+        ]
+        indices = libcudf.column.Column.from_pylibcudf(plc_column)
         keys = self._from_columns_like_self(columns)
         return keys, indices
 
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 0f12f266a95..17302311a7e 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1,43 +1,57 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from __future__ import annotations
 
 import copy
+import functools
 import itertools
-import pickle
 import textwrap
+import types
 import warnings
 from collections import abc
-from functools import cached_property
+from functools import cached_property, singledispatch
 from typing import TYPE_CHECKING, Any, Literal
 
 import cupy as cp
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 
 import pylibcudf as plc
 
 import cudf
+import cudf.core._internals
 from cudf import _lib as libcudf
-from cudf._lib import groupby as libgroupby
-from cudf._lib.sort import segmented_sort_by_key
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
-from cudf.api.types import is_list_like, is_numeric_dtype
+from cudf.api.types import (
+    is_list_like,
+    is_numeric_dtype,
+    is_string_dtype,
+)
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals import aggregation, sorting
 from cudf.core.abc import Serializable
 from cudf.core.buffer import acquire_spill_lock
-from cudf.core.column.column import ColumnBase, StructDtype, as_column
+from cudf.core.column.column import ColumnBase, as_column, column_empty
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.copy_types import GatherMap
+from cudf.core.dtypes import (
+    CategoricalDtype,
+    DecimalDtype,
+    IntervalDtype,
+    ListDtype,
+    StructDtype,
+)
 from cudf.core.join._join_helpers import _match_join_keys
 from cudf.core.mixins import Reducible, Scannable
 from cudf.core.multiindex import MultiIndex
 from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply
+from cudf.utils.dtypes import cudf_dtype_to_pa_type
 from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import GetAttrGetItemMixin
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from collections.abc import Generator, Hashable, Iterable
 
     from cudf._typing import (
         AggType,
@@ -46,6 +60,152 @@
         ScalarLike,
     )
 
+# The sets below define the possible aggregations that can be performed on
+# different dtypes. These strings must be elements of the AggregationKind enum.
+# The libcudf infrastructure exists for "COLLECT" support on
+# categoricals, but the dtype support in python does not.
+_CATEGORICAL_AGGS = {"COUNT", "NUNIQUE", "SIZE", "UNIQUE"}
+_STRING_AGGS = {
+    "COLLECT",
+    "COUNT",
+    "MAX",
+    "MIN",
+    "NTH",
+    "NUNIQUE",
+    "SIZE",
+    "UNIQUE",
+}
+_LIST_AGGS = {"COLLECT"}
+_STRUCT_AGGS = {"COLLECT", "CORRELATION", "COVARIANCE"}
+_INTERVAL_AGGS = {"COLLECT"}
+_DECIMAL_AGGS = {
+    "ARGMIN",
+    "ARGMAX",
+    "COLLECT",
+    "COUNT",
+    "MAX",
+    "MIN",
+    "NTH",
+    "NUNIQUE",
+    "SUM",
+}
+
+
+@singledispatch
+def get_valid_aggregation(dtype):
+    """Return the aggregation kinds supported for ``dtype``.
+
+    The result is a set of AggregationKind names, or the string "ALL"
+    when no restriction applies to the dtype.
+    """
+    if is_string_dtype(dtype):
+        return _STRING_AGGS
+    return "ALL"
+
+
+@get_valid_aggregation.register
+def _(dtype: ListDtype):
+    return _LIST_AGGS
+
+
+@get_valid_aggregation.register
+def _(dtype: CategoricalDtype):
+    return _CATEGORICAL_AGGS
+
+
+@get_valid_aggregation.register
+def _(dtype: StructDtype):
+    return _STRUCT_AGGS
+
+
+@get_valid_aggregation.register
+def _(dtype: IntervalDtype):
+    return _INTERVAL_AGGS
+
+
+@get_valid_aggregation.register
+def _(dtype: DecimalDtype):
+    return _DECIMAL_AGGS
+
+
+@singledispatch
+def _is_unsupported_agg_for_type(dtype, str_agg: str) -> bool:
+    """Tell whether ``str_agg`` must be rejected outright for ``dtype``."""
+    return False
+
+
+@_is_unsupported_agg_for_type.register
+def _(dtype: np.dtype, str_agg: str) -> bool:
+    # Only object-kind (string) numpy dtypes can ever be rejected here.
+    if dtype.kind != "O" or str_agg in _STRING_AGGS:
+        return False
+    if str_agg in {"cumsum", "cummin", "cummax"}:
+        return True
+    if str_agg == "<class 'list'>":
+        return False
+    # Substring match, so e.g. "nunique" also satisfies "unique".
+    permitted = (
+        "count",
+        "max",
+        "min",
+        "first",
+        "last",
+        "nunique",
+        "unique",
+        "nth",
+    )
+    return not any(token in str_agg for token in permitted)
+
+
+@_is_unsupported_agg_for_type.register
+def _(dtype: CategoricalDtype, str_agg: str) -> bool:
+    if str_agg in _CATEGORICAL_AGGS:
+        return False
+    if str_agg in {"cumsum", "cummin", "cummax"}:
+        return True
+    return not any(
+        token in str_agg for token in ("count", "max", "min", "unique")
+    )
+
+
+def _is_all_scan_aggregate(all_aggs: list[list[str]]) -> bool:
+    """
+    Report whether every requested aggregation is a scan.
+
+    An empty request (no aggregations at all) yields False.
+
+    Raises
+    ------
+    NotImplementedError
+        If both reduction aggregations and scan aggregations are present.
+    """
+    scan_names = {
+        "cumcount",
+        "cumsum",
+        "cummin",
+        "cummax",
+        "cumprod",
+        "rank",
+    }
+
+    def scan_flags():
+        for aggs in all_aggs:
+            for agg in aggs:
+                # Callables are identified by their __name__.
+                name = agg.__name__ if callable(agg) else agg
+                yield name in scan_names
+
+    # Two separate lazy passes, so each check short-circuits on its own.
+    only_scans = all(scan_flags())
+    has_scan = any(scan_flags())
+
+    if has_scan and not only_scans:
+        raise NotImplementedError(
+            "Cannot perform both aggregation and scan in one operation"
+        )
+    # Homogeneous request from here on; an empty one is not "all scans".
+    return only_scans and has_scan
+
 
 def _deprecate_collect():
     warnings.warn(
@@ -261,6 +421,7 @@ class GroupBy(Serializable, Reducible, Scannable):
 
     _VALID_SCANS = {
         "cumsum",
+        "cumprod",
         "cummin",
         "cummax",
     }
@@ -268,6 +429,7 @@ class GroupBy(Serializable, Reducible, Scannable):
     # Necessary because the function names don't directly map to the docs.
     _SCAN_DOCSTRINGS = {
         "cumsum": {"op_name": "Cumulative sum"},
+        "cumprod": {"op_name": "Cumulative product"},
         "cummin": {"op_name": "Cumulative min"},
         "cummax": {"op_name": "Cumulative max"},
     }
@@ -423,7 +585,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]:
         >>> df.groupby(by=["a"]).indices
         {10: array([0, 1]), 40: array([2])}
         """
-        offsets, group_keys, (indices,) = self._groupby.groups(
+        offsets, group_keys, (indices,) = self._groups(
             [
                 cudf.core.column.as_column(
                     range(len(self.obj)), dtype=size_type_dtype
@@ -431,7 +593,9 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]:
             ]
         )
 
-        group_keys = libcudf.stream_compaction.drop_duplicates(group_keys)
+        group_keys = cudf.core._internals.stream_compaction.drop_duplicates(
+            group_keys
+        )
         if len(group_keys) > 1:
             index = cudf.MultiIndex.from_arrays(group_keys)
         else:
@@ -482,7 +646,7 @@ def get_group(self, name, obj=None):
                 "instead of ``gb.get_group(name, obj=df)``.",
                 FutureWarning,
             )
-        if is_list_like(self._by):
+        if is_list_like(self._by) and len(self._by) == 1:
             if isinstance(name, tuple) and len(name) == 1:
                 name = name[0]
             else:
@@ -494,9 +658,7 @@ def size(self):
         """
         Return the size of each group.
         """
-        col = cudf.core.column.column_empty(
-            len(self.obj), "int8", masked=False
-        )
+        col = cudf.core.column.column_empty(len(self.obj), "int8")
         result = (
             cudf.Series._from_column(col, name=getattr(self.obj, "name", None))
             .groupby(self.grouping, sort=self._sort, dropna=self._dropna)
@@ -524,7 +686,8 @@ def cumcount(self, ascending: bool = True):
         return (
             cudf.Series._from_column(
                 cudf.core.column.column_empty(
-                    len(self.obj), "int8", masked=False
+                    len(self.obj),
+                    "int8",
                 ),
                 index=self.obj.index,
             )
@@ -581,11 +744,139 @@ def rank(x):
         return result
 
     @cached_property
-    def _groupby(self):
-        return libgroupby.GroupBy(
-            [*self.grouping.keys._columns], dropna=self._dropna
+    def _groupby(self) -> types.SimpleNamespace:
+        with acquire_spill_lock() as spill_lock:
+            plc_groupby = plc.groupby.GroupBy(
+                plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read")
+                        for col in self.grouping._key_columns
+                    ]
+                ),
+                plc.types.NullPolicy.EXCLUDE
+                if self._dropna
+                else plc.types.NullPolicy.INCLUDE,
+            )
+            # Do we need this because we just check _spill_locks in test_spillable_df_groupby?
+            return types.SimpleNamespace(
+                plc_groupby=plc_groupby, _spill_locks=spill_lock
+            )
+
+    def _groups(
+        self, values: Iterable[ColumnBase]
+    ) -> tuple[list[int], list[ColumnBase], list[ColumnBase]]:
+        plc_columns = [col.to_pylibcudf(mode="read") for col in values]
+        if not plc_columns:
+            plc_table = None
+        else:
+            plc_table = plc.Table(plc_columns)
+        offsets, grouped_keys, grouped_values = (
+            self._groupby.plc_groupby.get_groups(plc_table)
         )
 
+        return (
+            offsets,
+            [ColumnBase.from_pylibcudf(col) for col in grouped_keys.columns()],
+            (
+                [
+                    ColumnBase.from_pylibcudf(col)
+                    for col in grouped_values.columns()
+                ]
+                if grouped_values is not None
+                else []
+            ),
+        )
+
+    def _aggregate(
+        self, values: tuple[ColumnBase, ...], aggregations
+    ) -> tuple[
+        list[list[ColumnBase]],
+        list[ColumnBase],
+        list[list[tuple[str, str]]],
+    ]:
+        """Run the requested aggregations over each value column.
+
+        Returns the result columns per input column, the grouped key
+        columns, and per column the ``(agg, kind)`` pairs that were kept.
+        """
+        kept_aggs = []
+        requested_positions = []
+        plc_requests = []
+        for position, (col, col_aggs) in enumerate(zip(values, aggregations)):
+            allowed_kinds = get_valid_aggregation(col.dtype)
+            kept_for_col = []
+            plc_aggs = []
+            for agg in col_aggs:
+                if _is_unsupported_agg_for_type(col.dtype, str(agg)):
+                    raise TypeError(
+                        f"{col.dtype} type does not support {agg} operations"
+                    )
+                made = aggregation.make_aggregation(agg)
+                # Aggregations the dtype does not allow are dropped quietly.
+                if allowed_kinds != "ALL" and made.kind not in allowed_kinds:
+                    continue
+                kept_for_col.append((agg, made.kind))
+                plc_aggs.append(made.plc_obj)
+            kept_aggs.append(kept_for_col)
+            if not plc_aggs:
+                continue
+            request = plc.groupby.GroupByRequest(
+                col.to_pylibcudf(mode="read"), plc_aggs
+            )
+            plc_requests.append(request)
+            requested_positions.append(position)
+
+        if not plc_requests and any(len(v) > 0 for v in aggregations):
+            raise pd.errors.DataError(
+                "All requested aggregations are unsupported."
+            )
+
+        if _is_all_scan_aggregate(aggregations):
+            keys, results = self._groupby.plc_groupby.scan(plc_requests)
+        else:
+            keys, results = self._groupby.plc_groupby.aggregate(plc_requests)
+
+        result_columns: list[list[ColumnBase]] = [[] for _ in kept_aggs]
+        for position, result in zip(requested_positions, results):
+            result_columns[position] = [
+                ColumnBase.from_pylibcudf(out) for out in result.columns()
+            ]
+
+        key_columns = [
+            ColumnBase.from_pylibcudf(key) for key in keys.columns()
+        ]
+        return result_columns, key_columns, kept_aggs
+
+    def _shift(
+        self, values: tuple[ColumnBase, ...], periods: int, fill_values: list
+    ) -> Generator[ColumnBase]:
+        """Return a generator of group-wise shifted ``values`` columns."""
+        group_shift = self._groupby.plc_groupby.shift
+        table = plc.table.Table([c.to_pylibcudf(mode="read") for c in values])
+        fills = [
+            plc.interop.from_arrow(
+                pa.scalar(fill, type=cudf_dtype_to_pa_type(c.dtype))
+            )
+            for fill, c in zip(fill_values, values)
+        ]
+        _, shifted = group_shift(table, [periods] * len(values), fills)
+        return (ColumnBase.from_pylibcudf(c) for c in shifted.columns())
+
+    def _replace_nulls(
+        self, values: tuple[ColumnBase, ...], method: str
+    ) -> Generator[ColumnBase]:
+        """Return a generator of ``values`` columns with nulls replaced."""
+        group_replace = self._groupby.plc_groupby.replace_nulls
+        table = plc.Table([c.to_pylibcudf(mode="read") for c in values])
+        # "ffill" selects PRECEDING; every other method selects FOLLOWING.
+        if method == "ffill":
+            policy = plc.replace.ReplacePolicy.PRECEDING
+        else:
+            policy = plc.replace.ReplacePolicy.FOLLOWING
+        _, filled = group_replace(table, [policy] * len(values))
+
+        return (ColumnBase.from_pylibcudf(c) for c in filled.columns())
+
     @_performance_tracking
     def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
         """
@@ -701,7 +992,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
             result_columns,
             grouped_key_cols,
             included_aggregations,
-        ) = self._groupby.aggregate(columns, normalized_aggs)
+        ) = self._aggregate(columns, normalized_aggs)
 
         result_index = self.grouping.keys._from_columns_like_self(
             grouped_key_cols,
@@ -760,11 +1051,11 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
         else:
             if cudf.get_option(
                 "mode.pandas_compatible"
-            ) and not libgroupby._is_all_scan_aggregate(normalized_aggs):
+            ) and not _is_all_scan_aggregate(normalized_aggs):
                 # Even with `sort=False`, pandas guarantees that
                 # groupby preserves the order of rows within each group.
-                left_cols = list(self.grouping.keys.drop_duplicates()._columns)
-                right_cols = list(result_index._columns)
+                left_cols = self.grouping.keys.drop_duplicates()._columns
+                right_cols = result_index._columns
                 join_keys = [
                     _match_join_keys(lcol, rcol, "inner")
                     for lcol, rcol in zip(left_cols, right_cols)
@@ -794,7 +1085,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
                 # want, and right order is a matching gather map for
                 # the result table. Get the correct order by sorting
                 # the right gather map.
-                (right_order,) = libcudf.sort.sort_by_key(
+                (right_order,) = sorting.sort_by_key(
                     [right_order],
                     [left_order],
                     [True],
@@ -809,7 +1100,7 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
 
         if not self._as_index:
             result = result.reset_index()
-        if libgroupby._is_all_scan_aggregate(normalized_aggs):
+        if _is_all_scan_aggregate(normalized_aggs):
             # Scan aggregations return rows in original index order
             return self._mimic_pandas_order(result)
 
@@ -919,7 +1210,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
             # Can't use _mimic_pandas_order because we need to
             # subsample the gather map from the full input ordering,
             # rather than permuting the gather map of the output.
-            _, _, (ordering,) = self._groupby.groups(
+            _, _, (ordering,) = self._groups(
                 [as_column(range(0, len(self.obj)))]
             )
             # Invert permutation from original order to groups on the
@@ -1250,15 +1541,20 @@ def sample(
                 for off, size in zip(group_offsets, size_per_group):
                     rs.shuffle(indices[off : off + size])
             else:
-                rng = cp.random.default_rng(seed=random_state)
-                (indices,) = segmented_sort_by_key(
-                    [as_column(indices)],
-                    [as_column(rng.random(size=nrows))],
-                    as_column(group_offsets),
-                    [],
-                    [],
-                    stable=True,
+                keys = cp.random.default_rng(seed=random_state).random(
+                    size=nrows
                 )
+                with acquire_spill_lock():
+                    plc_table = plc.sorting.stable_segmented_sort_by_key(
+                        plc.Table(
+                            [as_column(indices).to_pylibcudf(mode="read")]
+                        ),
+                        plc.Table([as_column(keys).to_pylibcudf(mode="read")]),
+                        as_column(group_offsets).to_pylibcudf(mode="read"),
+                        [plc.types.Order.ASCENDING],
+                        [plc.types.NullOrder.AFTER],
+                    )
+                    indices = ColumnBase.from_pylibcudf(plc_table.columns()[0])
                 indices = cp.asarray(indices.data_array_view(mode="read"))
             # Which indices are we going to want?
             want = np.arange(samples_per_group.sum(), dtype=size_type_dtype)
@@ -1281,7 +1577,7 @@ def serialize(self):
 
         obj_header, obj_frames = self.obj.serialize()
         header["obj"] = obj_header
-        header["obj_type"] = pickle.dumps(type(self.obj))
+        header["obj_type_name"] = type(self.obj).__name__
         header["num_obj_frames"] = len(obj_frames)
         frames.extend(obj_frames)
 
@@ -1296,7 +1592,7 @@ def serialize(self):
     def deserialize(cls, header, frames):
         kwargs = header["kwargs"]
 
-        obj_type = pickle.loads(header["obj_type"])
+        obj_type = Serializable._name_type_map[header["obj_type_name"]]
         obj = obj_type.deserialize(
             header["obj"], frames[: header["num_obj_frames"]]
         )
@@ -1306,8 +1602,8 @@ def deserialize(cls, header, frames):
         return cls(obj, grouping, **kwargs)
 
     def _grouped(self, *, include_groups: bool = True):
-        offsets, grouped_key_cols, grouped_value_cols = self._groupby.groups(
-            [*self.obj.index._columns, *self.obj._columns]
+        offsets, grouped_key_cols, grouped_value_cols = self._groups(
+            itertools.chain(self.obj.index._columns, self.obj._columns)
         )
         grouped_keys = cudf.core.index._index_from_data(
             dict(enumerate(grouped_key_cols))
@@ -1939,7 +2235,7 @@ def transform(
                 "Currently, `transform()` supports only aggregations."
             ) from e
         # If the aggregation is a scan, don't broadcast
-        if libgroupby._is_all_scan_aggregate([[func]]):
+        if _is_all_scan_aggregate([[func]]):
             if len(result) != len(self.obj):
                 raise AssertionError(
                     "Unexpected result length for scan transform"
@@ -2156,7 +2452,7 @@ def _cov_or_corr(self, func, method_name):
         # create expanded dataframe consisting all combinations of the
         # struct columns-pairs to be used in the correlation or covariance
         # i.e. (('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2'))
-        column_names = self.grouping.values._column_names
+        column_names = self.grouping._values_column_names
         num_cols = len(column_names)
 
         column_pair_structs = {}
@@ -2191,7 +2487,7 @@ def _cov_or_corr(self, func, method_name):
 
         column_pair_groupby = cudf.DataFrame._from_data(
             column_pair_structs
-        ).groupby(by=self.grouping.keys)
+        ).groupby(by=self.grouping)
 
         try:
             gb_cov_corr = column_pair_groupby.agg(func)
@@ -2390,10 +2686,8 @@ def diff(self, periods=1, axis=0):
 
         if not axis == 0:
             raise NotImplementedError("Only axis=0 is supported.")
-
-        values = self.obj.__class__._from_data(
-            self.grouping.values._data, self.obj.index
-        )
+        values = self.grouping.values
+        values.index = self.obj.index
         return values - self.shift(periods=periods)
 
     def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries:
@@ -2403,7 +2697,7 @@ def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries:
             dict(
                 zip(
                     values._column_names,
-                    self._groupby.replace_nulls([*values._columns], method),
+                    self._replace_nulls(values._columns, method),
                 )
             )
         )
@@ -2497,9 +2791,8 @@ def fillna(
                 raise ValueError("Method can only be of 'ffill', 'bfill'.")
             return getattr(self, method, limit)()
 
-        values = self.obj.__class__._from_data(
-            self.grouping.values._data, self.obj.index
-        )
+        values = self.grouping.values
+        values.index = self.obj.index
         return values.fillna(
             value=value, inplace=inplace, axis=axis, limit=limit
         )
@@ -2507,7 +2800,7 @@ def fillna(
     @_performance_tracking
     def shift(
         self,
-        periods=1,
+        periods: int = 1,
         freq=None,
         axis=0,
         fill_value=None,
@@ -2554,7 +2847,7 @@ def shift(
         if freq is not None:
             raise NotImplementedError("Parameter freq is unsupported.")
 
-        if not axis == 0:
+        if axis != 0:
             raise NotImplementedError("Only axis=0 is supported.")
 
         if suffix is not None:
@@ -2562,20 +2855,18 @@ def shift(
 
         values = self.grouping.values
         if is_list_like(fill_value):
-            if len(fill_value) != len(values._data):
+            if len(fill_value) != values._num_columns:
                 raise ValueError(
                     "Mismatched number of columns and values to fill."
                 )
         else:
-            fill_value = [fill_value] * len(values._data)
+            fill_value = [fill_value] * values._num_columns
 
         result = self.obj.__class__._from_data(
             dict(
                 zip(
                     values._column_names,
-                    self._groupby.shift(
-                        [*values._columns], periods, fill_value
-                    )[0],
+                    self._shift(values._columns, periods, fill_value),
                 )
             )
         )
@@ -2674,9 +2965,7 @@ def _mimic_pandas_order(
         # result coming back from libcudf has null_count few rows than
         # the input, so we must produce an ordering from the full
         # input range.
-        _, _, (ordering,) = self._groupby.groups(
-            [as_column(range(0, len(self.obj)))]
-        )
+        _, _, (ordering,) = self._groups([as_column(range(0, len(self.obj)))])
         if self._dropna and any(
             c.has_nulls(include_nan=True) > 0
             for c in self.grouping._key_columns
@@ -3081,7 +3370,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
 
         # drop the first level if we have a multiindex
         if result._data.nlevels > 1:
-            result.columns = result._data.to_pandas_index().droplevel(0)
+            result.columns = result._data.to_pandas_index.droplevel(0)
 
         return result
 
@@ -3219,7 +3508,9 @@ def _handle_by_or_level(self, by=None, level=None):
                 self._handle_level(level)
         else:
             by_list = by if isinstance(by, list) else [by]
-
+            if not len(self._obj) and not len(by_list):
+                # We pretend to groupby an empty column
+                by_list = [cudf.Index._from_column(column_empty(0))]
             for by in by_list:
                 if callable(by):
                     self._handle_callable(by)
@@ -3241,22 +3532,25 @@ def _handle_by_or_level(self, by=None, level=None):
                     except (KeyError, TypeError):
                         self._handle_misc(by)
 
-    @property
+    @functools.cached_property
     def keys(self):
         """Return grouping key columns as index"""
-        nkeys = len(self._key_columns)
-
-        if nkeys == 0:
-            return cudf.Index([], name=None)
-        elif nkeys > 1:
+        if len(self._key_columns) > 1:
             return cudf.MultiIndex._from_data(
-                dict(zip(range(nkeys), self._key_columns))
+                dict(enumerate(self._key_columns))
             )._set_names(self.names)
         else:
             return cudf.Index._from_column(
                 self._key_columns[0], name=self.names[0]
             )
 
+    @property
+    def _values_column_names(self) -> list[Hashable]:
+        """Column names of ``obj`` that are not named grouping keys."""
+        candidates = self._obj._column_names
+        # Keys that live in `obj` must not be reported as values.
+        return [n for n in candidates if n not in self._named_columns]
+
     @property
     def values(self) -> cudf.core.frame.Frame:
         """Return value columns as a frame.
@@ -3267,11 +3561,9 @@ def values(self) -> cudf.core.frame.Frame:
 
         This is mainly used in transform-like operations.
         """
-        # If the key columns are in `obj`, filter them out
-        value_column_names = [
-            x for x in self._obj._column_names if x not in self._named_columns
-        ]
-        value_columns = self._obj._data.select_by_label(value_column_names)
+        value_columns = self._obj._data.select_by_label(
+            self._values_column_names
+        )
         return self._obj.__class__._from_data(value_columns)
 
     def _handle_callable(self, by):
@@ -3329,8 +3621,8 @@ def _handle_misc(self, by):
     def serialize(self):
         header = {}
         frames = []
-        header["names"] = pickle.dumps(self.names)
-        header["_named_columns"] = pickle.dumps(self._named_columns)
+        header["names"] = self.names
+        header["_named_columns"] = self._named_columns
         column_header, column_frames = cudf.core.column.serialize_columns(
             self._key_columns
         )
@@ -3340,8 +3632,8 @@ def serialize(self):
 
     @classmethod
     def deserialize(cls, header, frames):
-        names = pickle.loads(header["names"])
-        _named_columns = pickle.loads(header["_named_columns"])
+        names = header["names"]
+        _named_columns = header["_named_columns"]
         key_columns = cudf.core.column.deserialize_columns(
             header["columns"], frames
         )
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index cc3d8448151..b535e8aabd2 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1,9 +1,8 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
 import operator
-import pickle
 import warnings
 from collections.abc import Hashable, MutableMapping
 from functools import cache, cached_property
@@ -32,6 +31,7 @@
 )
 from cudf.core._base_index import BaseIndex, _return_get_indexer_result
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals import copying
 from cudf.core._internals.search import search_sorted
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
@@ -337,9 +337,9 @@ def _values(self) -> ColumnBase:
         if len(self) > 0:
             return column.as_column(self._range, dtype=self.dtype)
         else:
-            return column.column_empty(0, masked=False, dtype=self.dtype)
+            return column.column_empty(0, dtype=self.dtype)
 
-    def _clean_nulls_from_index(self) -> Self:
+    def _pandas_repr_compatible(self) -> Self:
         return self
 
     def _is_numeric(self) -> bool:
@@ -497,9 +497,8 @@ def serialize(self):
         header["index_column"]["step"] = self.step
         frames = []
 
-        header["name"] = pickle.dumps(self.name)
-        header["dtype"] = pickle.dumps(self.dtype)
-        header["type-serialized"] = pickle.dumps(type(self))
+        header["name"] = self.name
+        header["dtype"] = self.dtype.str
         header["frame_count"] = 0
         return header, frames
 
@@ -507,11 +506,14 @@ def serialize(self):
     @_performance_tracking
     def deserialize(cls, header, frames):
         h = header["index_column"]
-        name = pickle.loads(header["name"])
+        name = header["name"]
         start = h["start"]
         stop = h["stop"]
         step = h.get("step", 1)
-        return RangeIndex(start=start, stop=stop, step=step, name=name)
+        dtype = np.dtype(header["dtype"])
+        return RangeIndex(
+            start=start, stop=stop, step=step, dtype=dtype, name=name
+        )
 
     @property  # type: ignore
     @_performance_tracking
@@ -840,14 +842,14 @@ def sort_values(
     @_performance_tracking
     def _gather(self, gather_map, nullify=False, check_bounds=True):
         gather_map = cudf.core.column.as_column(gather_map)
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._column.take(gather_map, nullify, check_bounds),
             name=self.name,
         )
 
     @_performance_tracking
     def _apply_boolean_mask(self, boolean_mask):
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._column.apply_boolean_mask(boolean_mask), name=self.name
         )
 
@@ -855,7 +857,7 @@ def repeat(self, repeats, axis=None):
         return self._as_int_index().repeat(repeats, axis)
 
     def _split(self, splits):
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._as_int_index()._split(splits), name=self.name
         )
 
@@ -1125,15 +1127,9 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self:
             out.name = name
         return out
 
-    @classmethod
     @_performance_tracking
-    def _from_data_like_self(
-        cls, data: MutableMapping, name: Any = no_default
-    ) -> Self:
-        out = _index_from_data(data, name)
-        if name is not no_default:
-            out.name = name
-        return out
+    def _from_data_like_self(self, data: MutableMapping) -> Self:
+        return _index_from_data(data, self.name)
 
     @classmethod
     @_performance_tracking
@@ -1370,7 +1366,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             )
             scatter_map = libcudf.column.Column.from_pylibcudf(left_plc)
             indices = libcudf.column.Column.from_pylibcudf(right_plc)
-        result = libcudf.copying.scatter([indices], scatter_map, [result])[0]
+        result = copying.scatter([indices], scatter_map, [result])[0]
         result_series = cudf.Series._from_column(result)
 
         if method in {"ffill", "bfill", "pad", "backfill"}:
@@ -1492,7 +1488,7 @@ def __repr__(self) -> str:
             if isinstance(self._values, StringColumn):
                 output = repr(self.to_pandas(nullable=True))
             else:
-                output = repr(self._clean_nulls_from_index().to_pandas())
+                output = repr(self._pandas_repr_compatible().to_pandas())
                 # We should remove all the single quotes
                 # from the output due to the type-cast to
                 # object dtype happening above.
@@ -1648,20 +1644,6 @@ def __contains__(self, item) -> bool:
         hash(item)
         return item in self._column
 
-    def _clean_nulls_from_index(self) -> Index:
-        if self._values.has_nulls():
-            fill_value = (
-                str(cudf.NaT)
-                if isinstance(self, (DatetimeIndex, TimedeltaIndex))
-                else str(cudf.NA)
-            )
-            return cudf.Index._from_column(
-                self._column.astype("str").fillna(fill_value),
-                name=self.name,
-            )
-
-        return self
-
     def any(self) -> bool:
         return self._column.any()
 
@@ -2345,8 +2327,7 @@ def microsecond(self) -> Index:
                 # Need to manually promote column to int32 because
                 # pandas-matching binop behaviour requires that this
                 # __mul__ returns an int16 column.
-                self._column.millisecond.astype("int32")
-                * cudf.Scalar(1000, dtype="int32")
+                self._column.millisecond.astype("int32") * np.int32(1000)
             )
             + self._column.microsecond,
             name=self.name,
@@ -2962,13 +2943,13 @@ def median(self, *, skipna: bool = True, axis: int | None = 0):
     def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1):
         return self._column.std(skipna=skipna, ddof=ddof)
 
-    def total_seconds(self) -> cupy.ndarray:
+    def total_seconds(self) -> Index:
         """
         Return total duration of each element expressed in seconds.
 
         This method is currently not implemented.
         """
-        return self._column.total_seconds().values
+        return Index._from_column(self._column.total_seconds(), name=self.name)
 
     def ceil(self, freq: str) -> Self:
         """
@@ -3613,7 +3594,7 @@ def _is_interval(self) -> bool:
     def _is_boolean(self) -> bool:
         return False
 
-    def _clean_nulls_from_index(self) -> Self:
+    def _pandas_repr_compatible(self) -> Self:
         return self
 
     @property
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 21ac009e7ff..eded681baf0 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 """Base class for Frame types that have an index."""
 
 from __future__ import annotations
@@ -27,6 +27,7 @@
 import cudf
 import cudf._lib as libcudf
 import cudf.core
+import cudf.core._internals
 import cudf.core.algorithms
 from cudf.api.extensions import no_default
 from cudf.api.types import (
@@ -37,6 +38,7 @@
 )
 from cudf.core._base_index import BaseIndex
 from cudf.core._compat import PANDAS_LT_300
+from cudf.core._internals import copying
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import ColumnBase, NumericalColumn, as_column
 from cudf.core.column_accessor import ColumnAccessor
@@ -1104,13 +1106,11 @@ def dot(self, other, reflect=False):
             lhs = self.reindex(index=common, copy=False).values
             rhs = other.reindex(index=common, copy=False).values
             if isinstance(other, cudf.DataFrame):
-                result_index = other._data.to_pandas_index()
+                result_index = other._data.to_pandas_index
         elif isinstance(self, cudf.DataFrame) and isinstance(
             other, (cudf.Series, cudf.DataFrame)
         ):
-            common = self._data.to_pandas_index().union(
-                other.index.to_pandas()
-            )
+            common = self._data.to_pandas_index.union(other.index.to_pandas())
             if len(common) > self._num_columns or len(common) > len(
                 other.index
             ):
@@ -1122,7 +1122,7 @@ def dot(self, other, reflect=False):
             rhs = other.reindex(index=common, copy=False).values
             lhs = lhs.values
             if isinstance(other, cudf.DataFrame):
-                result_cols = other._data.to_pandas_index()
+                result_cols = other._data.to_pandas_index
 
         elif isinstance(
             other, (cp.ndarray, np.ndarray)
@@ -2242,7 +2242,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True):
         if not copy:
             raise ValueError("Truncating with copy=False is not supported.")
         axis = self._get_axis_from_axis_arg(axis)
-        ax = self.index if axis == 0 else self._data.to_pandas_index()
+        ax = self.index if axis == 0 else self._data.to_pandas_index
 
         if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
             raise ValueError("truncate requires a sorted index")
@@ -2836,16 +2836,22 @@ def hash_values(
 
         Parameters
         ----------
-        method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3'
+        method : {'murmur3', 'xxhash32', 'xxhash64', 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'}, default 'murmur3'
             Hash function to use:
 
             * murmur3: MurmurHash3 hash function
-            * md5: MD5 hash function
+            * xxhash32: xxHash32 hash function
             * xxhash64: xxHash64 hash function
+            * md5: MD5 hash function
+            * sha1: SHA-1 hash function
+            * sha224: SHA-224 hash function
+            * sha256: SHA-256 hash function
+            * sha384: SHA-384 hash function
+            * sha512: SHA-512 hash function
 
         seed : int, optional
             Seed value to use for the hash function. This parameter is only
-            supported for 'murmur3' and 'xxhash64'.
+            supported for 'murmur3', 'xxhash32', and 'xxhash64'.
 
 
         Returns
@@ -2900,7 +2906,7 @@ def hash_values(
         2    fe061786ea286a515b772d91b0dfcd70
         dtype: object
         """
-        seed_hash_methods = {"murmur3", "xxhash64"}
+        seed_hash_methods = {"murmur3", "xxhash32", "xxhash64"}
         if seed is None:
             seed = 0
         elif method not in seed_hash_methods:
@@ -2914,6 +2920,8 @@ def hash_values(
             )
             if method == "murmur3":
                 plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed)
+            elif method == "xxhash32":
+                plc_column = plc.hashing.xxhash_32(plc_table, seed)
             elif method == "xxhash64":
                 plc_column = plc.hashing.xxhash_64(plc_table, seed)
             elif method == "md5":
@@ -2952,10 +2960,10 @@ def _gather(
         if not gather_map.nullify and len(self) != gather_map.nrows:
             raise IndexError("Gather map is out of bounds")
         return self._from_columns_like_self(
-            libcudf.copying.gather(
-                list(self.index._columns + self._columns)
+            copying.gather(
+                itertools.chain(self.index._columns, self._columns)
                 if keep_index
-                else list(self._columns),
+                else self._columns,
                 gather_map.column,
                 nullify=gather_map.nullify,
             ),
@@ -3035,16 +3043,24 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
                 keep_index=keep_index,
             )
 
-        columns_to_slice = [
-            *(
-                self.index._columns
-                if keep_index and not has_range_index
-                else []
-            ),
-            *self._columns,
-        ]
+        columns_to_slice = (
+            itertools.chain(self.index._columns, self._columns)
+            if keep_index and not has_range_index
+            else self._columns
+        )
+        with acquire_spill_lock():
+            plc_tables = plc.copying.slice(
+                plc.Table(
+                    [col.to_pylibcudf(mode="read") for col in columns_to_slice]
+                ),
+                [start, stop],
+            )
+            sliced = [
+                libcudf.column.Column.from_pylibcudf(col)
+                for col in plc_tables[0].columns()
+            ]
         result = self._from_columns_like_self(
-            libcudf.copying.columns_slice(columns_to_slice, [start, stop])[0],
+            sliced,
             self._column_names,
             None if has_range_index or not keep_index else self.index.names,
         )
@@ -3054,21 +3070,21 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
         return result
 
     def _positions_from_column_names(
-        self, column_names, offset_by_index_columns=False
-    ):
+        self,
+        column_names: set[abc.Hashable],
+        offset_by_index_columns: bool = True,
+    ) -> list[int]:
         """Map each column name into their positions in the frame.
 
         Return positions of the provided column names, offset by the number of
         index columns if `offset_by_index_columns` is True. The order of
         indices returned corresponds to the column order in this Frame.
         """
-        num_index_columns = (
-            len(self.index._data) if offset_by_index_columns else 0
-        )
+        start = self.index.nlevels if offset_by_index_columns else 0
         return [
-            i + num_index_columns
-            for i, name in enumerate(self._column_names)
-            if name in set(column_names)
+            i
+            for i, name in enumerate(self._column_names, start=start)
+            if name in column_names
         ]
 
     def drop_duplicates(
@@ -3105,7 +3121,7 @@ def drop_duplicates(
             subset, offset_by_index_columns=not ignore_index
         )
         return self._from_columns_like_self(
-            libcudf.stream_compaction.drop_duplicates(
+            cudf.core._internals.stream_compaction.drop_duplicates(
                 list(self._columns)
                 if ignore_index
                 else list(self.index._columns + self._columns),
@@ -3118,7 +3134,9 @@ def drop_duplicates(
         )
 
     @_performance_tracking
-    def duplicated(self, subset=None, keep="first"):
+    def duplicated(
+        self, subset=None, keep: Literal["first", "last", False] = "first"
+    ) -> cudf.Series:
         """
         Return boolean Series denoting duplicate rows.
 
@@ -3218,11 +3236,26 @@ def duplicated(self, subset=None, keep="first"):
             name = self.name
         else:
             columns = [self._data[n] for n in subset]
-        distinct = libcudf.stream_compaction.distinct_indices(
-            columns, keep=keep
-        )
-        result = libcudf.copying.scatter(
-            [cudf.Scalar(False, dtype=bool)],
+
+        _keep_options = {
+            "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
+            "last": plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
+            False: plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
+        }
+
+        if (keep_option := _keep_options.get(keep)) is None:
+            raise ValueError('keep must be either "first", "last" or False')
+
+        with acquire_spill_lock():
+            plc_column = plc.stream_compaction.distinct_indices(
+                plc.Table([col.to_pylibcudf(mode="read") for col in columns]),
+                keep_option,
+                plc.types.NullEquality.EQUAL,
+                plc.types.NanEquality.ALL_EQUAL,
+            )
+            distinct = libcudf.column.Column.from_pylibcudf(plc_column)
+        result = copying.scatter(
+            [cudf.Scalar(False)],
             distinct,
             [as_column(True, length=len(self), dtype=bool)],
             bounds_check=False,
@@ -3230,14 +3263,26 @@ def duplicated(self, subset=None, keep="first"):
         return cudf.Series._from_column(result, index=self.index, name=name)
 
     @_performance_tracking
-    def _empty_like(self, keep_index=True) -> Self:
+    def _empty_like(self, keep_index: bool = True) -> Self:
+        with acquire_spill_lock():
+            plc_table = plc.copying.empty_like(
+                plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read")
+                        for col in (
+                            itertools.chain(self.index._columns, self._columns)
+                            if keep_index
+                            else self._columns
+                        )
+                    ]
+                )
+            )
+            columns = [
+                libcudf.column.Column.from_pylibcudf(col)
+                for col in plc_table.columns()
+            ]
         result = self._from_columns_like_self(
-            libcudf.copying.columns_empty_like(
-                [
-                    *(self.index._columns if keep_index else ()),
-                    *self._columns,
-                ]
-            ),
+            columns,
             self._column_names,
             self.index.names if keep_index else None,
         )
@@ -3245,25 +3290,24 @@ def _empty_like(self, keep_index=True) -> Self:
         result._data.rangeindex = self._data.rangeindex
         return result
 
-    def _split(self, splits, keep_index=True):
+    def _split(self, splits, keep_index: bool = True) -> list[Self]:
         if self._num_rows == 0:
             return []
 
-        columns_split = libcudf.copying.columns_split(
-            [
-                *(self.index._columns if keep_index else []),
-                *self._columns,
-            ],
+        columns_split = copying.columns_split(
+            itertools.chain(self.index._columns, self._columns)
+            if keep_index
+            else self._columns,
             splits,
         )
 
         return [
             self._from_columns_like_self(
-                columns_split[i],
+                split,
                 self._column_names,
                 self.index.names if keep_index else None,
             )
-            for i in range(len(splits) + 1)
+            for split in columns_split
         ]
 
     @_performance_tracking
@@ -3507,7 +3551,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs):
 
         col = _post_process_output_col(ans_col, retty)
 
-        col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask))
+        col.set_base_mask(ans_mask.as_mask())
         result = cudf.Series._from_column(col, index=self.index)
 
         return result
@@ -3851,7 +3895,6 @@ def _reindex(
                 if name in df._data
                 else cudf.core.column.column.column_empty(
                     dtype=dtypes.get(name, np.float64),
-                    masked=True,
                     row_count=len(index),
                 )
             )
@@ -3970,7 +4013,13 @@ def round(self, decimals=0, how="half_even"):
 
         cols = (
             col.round(decimals[name], how=how)
-            if name in decimals and col.dtype.kind in "fiu"
+            if name in decimals
+            and (
+                col.dtype.kind in "fiu"
+                or isinstance(
+                    col.dtype, (cudf.Decimal32Dtype, cudf.Decimal64Dtype)
+                )
+            )
             else col.copy(deep=True)
             for name, col in self._column_labels_and_values
         )
@@ -4328,12 +4377,10 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None):
         data_columns = [col.nans_to_nulls() for col in self._columns]
 
         return self._from_columns_like_self(
-            libcudf.stream_compaction.drop_nulls(
+            cudf.core._internals.stream_compaction.drop_nulls(
                 [*self.index._columns, *data_columns],
                 how=how,
-                keys=self._positions_from_column_names(
-                    subset, offset_by_index_columns=True
-                ),
+                keys=self._positions_from_column_names(subset),
                 thresh=thresh,
             ),
             self._column_names,
@@ -4353,7 +4400,7 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True):
                 f"{len(boolean_mask.column)} not {len(self)}"
             )
         return self._from_columns_like_self(
-            libcudf.stream_compaction.apply_boolean_mask(
+            cudf.core._internals.stream_compaction.apply_boolean_mask(
                 list(self.index._columns + self._columns)
                 if keep_index
                 else list(self._columns),
@@ -4363,6 +4410,12 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True):
             index_names=self.index.names if keep_index else None,
         )
 
+    def _pandas_repr_compatible(self) -> Self:
+        """Prepare the columns and the index for a pandas-like repr."""
+        prepared = super()._pandas_repr_compatible()
+        prepared.index = self.index._pandas_repr_compatible()
+        return prepared
+
     def take(self, indices, axis=0):
         """Return a new frame containing the rows specified by *indices*.
 
@@ -6264,17 +6317,16 @@ def ge(self, other, axis="columns", level=None, fill_value=None):
             other=other, op="__ge__", fill_value=fill_value, can_reindex=True
         )
 
-    def _preprocess_subset(self, subset):
+    def _preprocess_subset(self, subset) -> set[abc.Hashable]:
         if subset is None:
             subset = self._column_names
         elif (
-            not np.iterable(subset)
-            or isinstance(subset, str)
+            is_scalar(subset)
             or isinstance(subset, tuple)
             and subset in self._column_names
         ):
             subset = (subset,)
-        diff = set(subset) - set(self._data)
+        diff = set(subset) - set(self._column_names)
         if len(diff) != 0:
             raise KeyError(f"columns {diff} do not exist")
         return subset
@@ -6362,9 +6414,49 @@ def rank(
             elif source._num_columns != num_cols:
                 dropped_cols = True
 
-        result_columns = libcudf.sort.rank_columns(
-            [*source._columns], method_enum, na_option, ascending, pct
+        column_order = (
+            plc.types.Order.ASCENDING
+            if ascending
+            else plc.types.Order.DESCENDING
         )
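+        # Map `na_option` to the ordering of nulls relative to valid values.
+        # ascending=True:
+        #   "top"              -> nulls smallest (NullOrder.BEFORE)
+        #   "bottom" / "keep"  -> nulls largest  (NullOrder.AFTER)
+        # ascending=False:
+        #   "top"              -> nulls largest  (NullOrder.AFTER)
+        #   "bottom" / "keep"  -> nulls smallest (NullOrder.BEFORE)
+        # ("keep" additionally selects NullPolicy.EXCLUDE below.)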
+        if ascending:
+            if na_option == "top":
+                null_precedence = plc.types.NullOrder.BEFORE
+            else:
+                null_precedence = plc.types.NullOrder.AFTER
+        else:
+            if na_option == "top":
+                null_precedence = plc.types.NullOrder.AFTER
+            else:
+                null_precedence = plc.types.NullOrder.BEFORE
+        c_null_handling = (
+            plc.types.NullPolicy.EXCLUDE
+            if na_option == "keep"
+            else plc.types.NullPolicy.INCLUDE
+        )
+
+        with acquire_spill_lock():
+            result_columns = [
+                libcudf.column.Column.from_pylibcudf(
+                    plc.sorting.rank(
+                        col.to_pylibcudf(mode="read"),
+                        method_enum,
+                        column_order,
+                        c_null_handling,
+                        null_precedence,
+                        pct,
+                    )
+                )
+                for col in source._columns
+            ]
 
         if dropped_cols:
             result = type(source)._from_data(
@@ -6690,7 +6782,7 @@ def _drop_rows_by_labels(
             return obj.__class__._from_data(
                 join_res.iloc[:, idx_nlv:]._data,
                 index=midx,
-                columns=obj._data.to_pandas_index(),
+                columns=obj._data.to_pandas_index,
             )
 
     else:
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index 5c224176730..6e965ceca66 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from __future__ import annotations
 
-import itertools
 from typing import Any
 
 import pylibcudf as plc
@@ -9,6 +8,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.types import size_type_dtype
+from cudf.core._internals import sorting
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.copy_types import GatherMap
 from cudf.core.join._join_helpers import (
@@ -242,21 +242,13 @@ def _gather_maps(self, left_cols, right_cols):
         # To reorder maps so that they are in order of the input
         # tables, we gather from iota on both right and left, and then
         # sort the gather maps with those two columns as key.
-        key_order = list(
-            itertools.chain.from_iterable(
-                libcudf.copying.gather(
-                    [
-                        cudf.core.column.as_column(
-                            range(n), dtype=size_type_dtype
-                        )
-                    ],
-                    map_,
-                    nullify=null,
-                )
-                for map_, n, null in zip(maps, lengths, nullify)
+        key_order = [
+            cudf.core.column.as_column(range(n), dtype=size_type_dtype).take(
+                map_, nullify=null, check_bounds=False
             )
-        )
-        return libcudf.sort.sort_by_key(
+            for map_, n, null in zip(maps, lengths, nullify)
+        ]
+        return sorting.sort_by_key(
             list(maps),
             # If how is right, right map is primary sort key.
             key_order[:: -1 if self.how == "right" else 1],
@@ -426,7 +418,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame:
             else:
                 to_sort = [*result._columns]
                 index_names = None
-            result_columns = libcudf.sort.sort_by_key(
+            result_columns = sorting.sort_by_key(
                 to_sort,
                 by,
                 [True] * len(by),
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 173d4e1c584..e7efd01ca85 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1,11 +1,10 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
 import itertools
 import numbers
 import operator
-import pickle
 import warnings
 from functools import cached_property
 from typing import TYPE_CHECKING, Any
@@ -23,6 +22,7 @@
 from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar
 from cudf.core import column
 from cudf.core._base_index import _return_get_indexer_result
+from cudf.core._internals import copying, sorting
 from cudf.core.algorithms import factorize
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column_accessor import ColumnAccessor
@@ -192,18 +192,16 @@ def __init__(
         source_data = {}
         for i, (code, level) in enumerate(zip(new_codes, new_levels)):
             if len(code):
-                lo, hi = libcudf.reduce.minmax(code)
-                if lo.value < -1 or hi.value > len(level) - 1:
+                lo, hi = code.minmax()
+                if lo < -1 or hi > len(level) - 1:
                     raise ValueError(
                         f"Codes must be -1 <= codes <= {len(level) - 1}"
                     )
-                if lo.value == -1:
+                if lo == -1:
                     # Now we can gather and insert null automatically
                     code[code == -1] = np.iinfo(size_type_dtype).min
-            result_col = libcudf.copying.gather(
-                [level._column], code, nullify=True
-            )
-            source_data[i] = result_col[0]._with_type_metadata(level.dtype)
+            result_col = level._column.take(code, nullify=True)
+            source_data[i] = result_col._with_type_metadata(level.dtype)
 
         super().__init__(ColumnAccessor(source_data))
         self._levels = new_levels
@@ -363,6 +361,13 @@ def _from_data(
             name=name,
         )
 
+    @_performance_tracking
+    def _from_data_like_self(self, data: MutableMapping) -> Self:
+        mi = type(self)._from_data(data, name=self.name)
+        if mi.nlevels == self.nlevels:
+            mi.names = self.names
+        return mi
+
     @classmethod
     def _simple_new(
         cls,
@@ -921,15 +926,15 @@ def take(self, indices) -> Self:
     def serialize(self):
         header, frames = super().serialize()
         # Overwrite the names in _data with the true names.
-        header["column_names"] = pickle.dumps(self.names)
+        header["column_names"] = self.names
         return header, frames
 
     @classmethod
     @_performance_tracking
     def deserialize(cls, header, frames):
         # Spoof the column names to construct the frame, then set manually.
-        column_names = pickle.loads(header["column_names"])
-        header["column_names"] = pickle.dumps(range(0, len(column_names)))
+        column_names = header["column_names"]
+        header["column_names"] = range(0, len(column_names))
         obj = super().deserialize(header, frames)
         return obj._set_names(column_names)
 
@@ -1125,7 +1130,7 @@ def _concat(cls, objs) -> Self:
         # TODO: Verify if this is really necessary or if we can rely on
         # DataFrame._concat.
         if len(source_data) > 1:
-            colnames = source_data[0]._data.to_pandas_index()
+            colnames = source_data[0]._data.to_pandas_index
             for obj in source_data[1:]:
                 obj.columns = colnames
 
@@ -1678,7 +1683,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool:
                 f"Expected a list-like or None for `null_position`, got "
                 f"{type(null_position)}"
             )
-        return libcudf.sort.is_sorted(
+        return sorting.is_sorted(
             [*self._columns], ascending=ascending, null_position=null_position
         )
 
@@ -1755,16 +1760,6 @@ def nunique(self, dropna: bool = True) -> int:
         mi = self.dropna(how="all") if dropna else self
         return len(mi.unique())
 
-    def _clean_nulls_from_index(self) -> Self:
-        """
-        Convert all na values(if any) in MultiIndex object
-        to `<NA>` as a preprocessing step to `__repr__` methods.
-        """
-        index_df = self.to_frame(index=False, name=list(range(self.nlevels)))
-        return MultiIndex.from_frame(
-            index_df._clean_nulls_from_dataframe(index_df), names=self.names
-        )
-
     @_performance_tracking
     def memory_usage(self, deep: bool = False) -> int:
         usage = sum(col.memory_usage for col in self._columns)
@@ -1934,7 +1929,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             )
             scatter_map = libcudf.column.Column.from_pylibcudf(left_plc)
             indices = libcudf.column.Column.from_pylibcudf(right_plc)
-        result = libcudf.copying.scatter([indices], scatter_map, [result])[0]
+        result = copying.scatter([indices], scatter_map, [result])[0]
         result_series = cudf.Series._from_column(result)
 
         if method in {"ffill", "bfill", "pad", "backfill"}:
@@ -2070,7 +2065,7 @@ def _union(self, other, sort=None) -> Self:
 
         result_df = self_df.merge(other_df, on=col_names, how="outer")
         result_df = result_df.sort_values(
-            by=result_df._data.to_pandas_index()[self.nlevels :],
+            by=result_df._data.to_pandas_index[self.nlevels :],
             ignore_index=True,
         )
 
diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py
index d95d252559f..391ee31f125 100644
--- a/python/cudf/cudf/core/resample.py
+++ b/python/cudf/cudf/core/resample.py
@@ -15,7 +15,6 @@
 # limitations under the License.
 from __future__ import annotations
 
-import pickle
 import warnings
 from typing import TYPE_CHECKING
 
@@ -26,6 +25,7 @@
 
 import cudf
 from cudf._lib.column import Column
+from cudf.core.abc import Serializable
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.groupby.groupby import (
     DataFrameGroupBy,
@@ -97,21 +97,21 @@ def serialize(self):
         header, frames = super().serialize()
         grouping_head, grouping_frames = self.grouping.serialize()
         header["grouping"] = grouping_head
-        header["resampler_type"] = pickle.dumps(type(self))
+        header["resampler_type"] = type(self).__name__
         header["grouping_frames_count"] = len(grouping_frames)
         frames.extend(grouping_frames)
         return header, frames
 
     @classmethod
     def deserialize(cls, header, frames):
-        obj_type = pickle.loads(header["obj_type"])
+        obj_type = Serializable._name_type_map[header["obj_type_name"]]
         obj = obj_type.deserialize(
             header["obj"], frames[: header["num_obj_frames"]]
         )
         grouping = _ResampleGrouping.deserialize(
             header["grouping"], frames[header["num_obj_frames"] :]
         )
-        resampler_cls = pickle.loads(header["resampler_type"])
+        resampler_cls = Serializable._name_type_map[header["resampler_type"]]
         out = resampler_cls.__new__(resampler_cls)
         out.grouping = grouping
         super().__init__(out, obj, by=grouping)
@@ -163,8 +163,8 @@ def serialize(self):
 
     @classmethod
     def deserialize(cls, header, frames):
-        names = pickle.loads(header["names"])
-        _named_columns = pickle.loads(header["_named_columns"])
+        names = header["names"]
+        _named_columns = header["_named_columns"]
         key_columns = cudf.core.column.deserialize_columns(
             header["columns"], frames[: -header["__bin_labels_count"]]
         )
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 84c653c5b3f..0abd42d4d4e 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -12,7 +12,6 @@
 
 import cudf
 from cudf._lib.column import Column
-from cudf._lib.transform import one_hot_encode
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
 from cudf.api.types import is_scalar
@@ -432,8 +431,9 @@ def concat(
 
             result_columns = (
                 objs[0]
-                ._data.to_pandas_index()
-                .append([obj._data.to_pandas_index() for obj in objs[1:]])
+                ._data.to_pandas_index.append(
+                    [obj._data.to_pandas_index for obj in objs[1:]]
+                )
                 .unique()
             )
 
@@ -690,7 +690,7 @@ def _tile(A, reps):
     if not value_vars:
         # TODO: Use frame._data.label_dtype when it's more consistently set
         var_data = cudf.Series(
-            value_vars, dtype=frame._data.to_pandas_index().dtype
+            value_vars, dtype=frame._data.to_pandas_index.dtype
         )
     else:
         var_data = (
@@ -1030,7 +1030,8 @@ def as_tuple(x):
                 {
                     name: idx._column
                     for name, idx in zip(
-                        names, target._split(range(nrows, new_size, nrows))
+                        names,
+                        target._split(list(range(nrows, new_size, nrows))),
                     )
                 }
             )
@@ -1273,7 +1274,7 @@ def unstack(df, level, fill_value=None, sort: bool = True):
         res = df.T.stack(future_stack=False)
         # Result's index is a multiindex
         res.index.names = (
-            tuple(df._data.to_pandas_index().names) + df.index.names
+            tuple(df._data.to_pandas_index.names) + df.index.names
         )
         return res
     else:
@@ -1338,7 +1339,11 @@ def _one_hot_encode_column(
             f"np.iinfo({size_type_dtype}).max. Consider reducing "
             "size of category"
         )
-    data = one_hot_encode(column, categories)
+    result_labels = (
+        x if x is not None else "<NA>"
+        for x in categories.to_arrow().to_pylist()
+    )
+    data = dict(zip(result_labels, column.one_hot_encode(categories)))
 
     if drop_first and len(data):
         data.pop(next(iter(data)))
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index 80dd0921f9c..7d246960cc9 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -178,13 +178,13 @@ def dtype(self):
     def is_valid(self):
         if not self._is_host_value_current:
             self._device_value_to_host()
-        return not cudf._lib.scalar._is_null_host_scalar(self._host_value)
+        return not cudf.utils.utils._is_null_host_scalar(self._host_value)
 
     def _device_value_to_host(self):
         self._host_value = self._device_value._to_host_scalar()
 
     def _preprocess_host_value(self, value, dtype):
-        valid = not cudf._lib.scalar._is_null_host_scalar(value)
+        valid = not cudf.utils.utils._is_null_host_scalar(value)
 
         if isinstance(value, list):
             if dtype is not None:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 58cefc6554e..805f9f9a9f9 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1,10 +1,9 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
 import functools
 import inspect
-import pickle
 import textwrap
 import warnings
 from collections import abc
@@ -17,7 +16,6 @@
 from typing_extensions import Self, assert_never
 
 import cudf
-from cudf import _lib as libcudf
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
@@ -28,7 +26,6 @@
 )
 from cudf.core import indexing_utils
 from cudf.core._compat import PANDAS_LT_300
-from cudf.core.abc import Serializable
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
     ColumnBase,
@@ -415,7 +412,7 @@ def _loc_to_iloc(self, arg):
                 return indices
 
 
-class Series(SingleColumnFrame, IndexedFrame, Serializable):
+class Series(SingleColumnFrame, IndexedFrame):
     """
     One-dimensional GPU array (including time series).
 
@@ -526,7 +523,7 @@ def from_categorical(cls, categorical, codes=None):
 
             mask = None
             if not valid_codes.all():
-                mask = libcudf.transform.bools_to_mask(valid_codes)
+                mask = valid_codes.as_mask()
             col = CategoricalColumn(
                 data=col.data,
                 size=codes.size,
@@ -900,7 +897,7 @@ def hasnans(self):
     def serialize(self):
         header, frames = super().serialize()
 
-        header["index"], index_frames = self.index.serialize()
+        header["index"], index_frames = self.index.device_serialize()
         header["index_frame_count"] = len(index_frames)
         # For backwards compatibility with older versions of cuDF, index
         # columns are placed before data columns.
@@ -916,8 +913,7 @@ def deserialize(cls, header, frames):
             header, frames[header["index_frame_count"] :]
         )
 
-        idx_typ = pickle.loads(header["index"]["type-serialized"])
-        index = idx_typ.deserialize(header["index"], frames[:index_nframes])
+        index = cls.device_deserialize(header["index"], frames[:index_nframes])
         obj.index = index
 
         return obj
@@ -1453,35 +1449,16 @@ def __repr__(self):
                 warnings.simplefilter("ignore", FutureWarning)
                 preprocess = cudf.concat([top, bottom])
         else:
-            preprocess = self.copy()
-        preprocess.index = preprocess.index._clean_nulls_from_index()
-        if (
-            preprocess.nullable
-            and not isinstance(
-                preprocess.dtype,
-                (
-                    cudf.CategoricalDtype,
-                    cudf.ListDtype,
-                    cudf.StructDtype,
-                    cudf.core.dtypes.DecimalDtype,
-                ),
-            )
-        ) or preprocess.dtype.kind == "m":
-            fill_value = (
-                str(cudf.NaT)
-                if preprocess.dtype.kind in "mM"
-                else str(cudf.NA)
-            )
-            output = repr(
-                preprocess.astype("str").fillna(fill_value).to_pandas()
-            )
-        elif isinstance(preprocess.dtype, cudf.CategoricalDtype):
+            preprocess = self
+        if isinstance(preprocess.dtype, cudf.CategoricalDtype):
             min_rows = (
                 height
                 if pd.get_option("display.min_rows") == 0
                 else pd.get_option("display.min_rows")
             )
             show_dimensions = pd.get_option("display.show_dimensions")
+            preprocess = preprocess.copy(deep=False)
+            preprocess.index = preprocess.index._pandas_repr_compatible()
             if preprocess.dtype.categories.dtype.kind == "f":
                 pd_series = (
                     preprocess.astype("str")
@@ -1506,7 +1483,7 @@ def __repr__(self):
                 na_rep=str(cudf.NA),
             )
         else:
-            output = repr(preprocess.to_pandas())
+            output = repr(preprocess._pandas_repr_compatible().to_pandas())
 
         lines = output.split("\n")
         if isinstance(preprocess.dtype, cudf.CategoricalDtype):
@@ -3414,7 +3391,7 @@ def describe(
         )
 
     @_performance_tracking
-    def digitize(self, bins, right=False):
+    def digitize(self, bins: np.ndarray, right: bool = False) -> Self:
         """Return the indices of the bins to which each value belongs.
 
         Notes
@@ -3445,9 +3422,8 @@ def digitize(self, bins, right=False):
         3    2
         dtype: int32
         """
-        return Series._from_column(
-            cudf.core.column.numerical.digitize(self._column, bins, right),
-            name=self.name,
+        return type(self)._from_column(
+            self._column.digitize(bins, right), name=self.name
         )
 
     @_performance_tracking
@@ -4130,8 +4106,8 @@ def microsecond(self) -> Series:
         # Need to manually promote column to int32 because
         # pandas-matching binop behaviour requires that this
         # __mul__ returns an int16 column.
-        extra = self.series._column.millisecond.astype("int32") * cudf.Scalar(
-            1000, dtype="int32"
+        extra = self.series._column.millisecond.astype("int32") * np.int32(
+            1000
         )
         return self._return_result_like_self(micro + extra)
 
@@ -5188,6 +5164,66 @@ def components(self) -> cudf.DataFrame:
             ca, index=self.series.index
         )
 
+    def total_seconds(self) -> Series:
+        """
+        Return total duration of each element expressed in seconds.
+
+        This method is available directly on TimedeltaIndex
+        and on Series containing timedelta values under the ``.dt`` namespace.
+
+        Returns
+        -------
+        Index or Series
+            When the calling object is a TimedeltaIndex,
+            the return type is an Index with a float64 dtype. When the calling object
+            is a Series, the return type is Series of type `float64` whose
+            index is the same as the original.
+
+        See Also
+        --------
+        datetime.timedelta.total_seconds : Standard library version
+            of this method.
+        TimedeltaIndex.components : Return a DataFrame with components of
+            each Timedelta.
+
+        Examples
+        --------
+        **Series**
+
+        >>> import cudf
+        >>> import pandas as pd
+        >>> import numpy as np
+        >>> s = cudf.Series(pd.to_timedelta(np.arange(5), unit="D"))
+        >>> s
+        0    0 days 00:00:00
+        1    1 days 00:00:00
+        2    2 days 00:00:00
+        3    3 days 00:00:00
+        4    4 days 00:00:00
+        dtype: timedelta64[ns]
+
+        >>> s.dt.total_seconds()
+        0         0.0
+        1     86400.0
+        2    172800.0
+        3    259200.0
+        4    345600.0
+        dtype: float64
+
+        **TimedeltaIndex**
+
+        >>> idx = cudf.from_pandas(pd.to_timedelta(np.arange(5), unit="D"))
+        >>> idx
+        TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'],
+                       dtype='timedelta64[ns]', freq=None)
+
+        >>> idx.total_seconds()
+        Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64')
+        """
+        return self._return_result_like_self(
+            self.series._column.total_seconds()
+        )
+
 
 @_performance_tracking
 def _align_indices(series_list, how="outer", allow_non_unique=False):
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index f6d0664758f..9c8da020ddc 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -380,7 +380,7 @@ def where(self, cond, other=None, inplace=False):
             source_col=self._column, other=other, inplace=inplace
         )
 
-        result = cudf._lib.copying.copy_if_else(input_col, other, cond)
+        result = input_col.copy_if_else(other, cond)
         return result._with_type_metadata(self.dtype)
 
     @_performance_tracking
diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py
index dda1f199078..479838ef2a8 100644
--- a/python/cudf/cudf/core/subword_tokenizer.py
+++ b/python/cudf/cudf/core/subword_tokenizer.py
@@ -8,10 +8,6 @@
 
 import pylibcudf as plc
 
-from cudf._lib.nvtext.subword_tokenize import (
-    subword_tokenize_inmem_hash as cpp_subword_tokenize,
-)
-
 
 def _cast_to_appropriate_type(ar, cast_type):
     if cast_type == "cp":
@@ -210,8 +206,7 @@ def __call__(
         stride = max_length - stride
         # behavior varies from subword_tokenize but maps with huggingface
 
-        input_ids, attention_mask, metadata = cpp_subword_tokenize(
-            text._column,
+        input_ids, attention_mask, metadata = text._column.subword_tokenize(
             self.vocab_file,
             max_sequence_length=max_length,
             stride=stride,
diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py
index 1e31376cce8..58dabc85491 100644
--- a/python/cudf/cudf/core/tokenize_vocabulary.py
+++ b/python/cudf/cudf/core/tokenize_vocabulary.py
@@ -1,13 +1,10 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
 import pylibcudf as plc
 
 import cudf
-from cudf._lib.nvtext.tokenize import (
-    tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
-)
 
 
 class TokenizeVocabulary:
@@ -20,7 +17,7 @@ class TokenizeVocabulary:
         Strings column of vocabulary terms
     """
 
-    def __init__(self, vocabulary: "cudf.Series"):
+    def __init__(self, vocabulary: cudf.Series) -> None:
         self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary(
             vocabulary._column.to_pylibcudf(mode="read")
         )
@@ -45,9 +42,8 @@ def tokenize(
         """
         if delimiter is None:
             delimiter = ""
-        delim = cudf.Scalar(delimiter, dtype="str")
-        result = cpp_tokenize_with_vocabulary(
-            text._column, self.vocabulary, delim, default_id
+        result = text._column.tokenize_with_vocabulary(
+            self.vocabulary, delimiter, default_id
         )
 
         return cudf.Series._from_column(result)
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 80ee078917a..8be336021b1 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -15,9 +15,6 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.strings.convert.convert_integers import (
-    is_integer as cpp_is_integer,
-)
 from cudf.api.types import is_integer, is_scalar
 from cudf.core import column
 from cudf.core.buffer import acquire_spill_lock
@@ -232,7 +229,7 @@ def to_datetime(
                         )
                         break
                     elif arg_col.dtype.kind == "O":
-                        if not cpp_is_integer(arg_col).all():
+                        if not arg_col.is_integer().all():
                             col = new_series._column.strptime(
                                 cudf.dtype("datetime64[ns]"), format=format
                             )
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index 91f23490031..6d3dc2dc7d9 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -2,14 +2,12 @@
 from __future__ import annotations
 
 import warnings
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 import pandas as pd
 
 import cudf
-from cudf import _lib as libcudf
-from cudf._lib import strings as libstrings
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype
 from cudf.core._internals import unary
 from cudf.core.column import as_column
@@ -18,10 +16,16 @@
 from cudf.utils.dtypes import can_convert_to_column
 
 if TYPE_CHECKING:
-    from cudf.core.column import ColumnBase
+    from cudf.core.column.numerical import NumericalColumn
+    from cudf.core.column.string import StringColumn
 
 
-def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
+def to_numeric(
+    arg,
+    errors: Literal["raise", "coerce", "ignore"] = "raise",
+    downcast: Literal["integer", "signed", "unsigned", "float", None] = None,
+    dtype_backend=None,
+):
     """
     Convert argument into numerical types.
 
@@ -130,7 +134,9 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
         else:
             try:
                 col = _convert_str_col(
-                    col._get_decategorized_column(), errors, downcast
+                    col._get_decategorized_column(),  # type: ignore[attr-defined]
+                    errors,
+                    downcast,
                 )
             except ValueError as e:
                 if errors == "ignore":
@@ -139,7 +145,7 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
                     raise e
     elif is_string_dtype(dtype):
         try:
-            col = _convert_str_col(col, errors, downcast)
+            col = _convert_str_col(col, errors, downcast)  # type: ignore[arg-type]
         except ValueError as e:
             if errors == "ignore":
                 return arg
@@ -186,7 +192,11 @@ def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
         return col.values
 
 
-def _convert_str_col(col, errors, _downcast=None):
+def _convert_str_col(
+    col: StringColumn,
+    errors: Literal["raise", "coerce", "ignore"],
+    _downcast: Literal["integer", "signed", "unsigned", "float", None] = None,
+) -> NumericalColumn:
     """
     Converts a string column to numeric column
 
@@ -212,13 +222,21 @@ def _convert_str_col(col, errors, _downcast=None):
     if not is_string_dtype(col):
         raise TypeError("col must be string dtype.")
 
-    is_integer = libstrings.is_integer(col)
-    if is_integer.all():
-        return col.astype(dtype=cudf.dtype("i8"))
+    if col.is_integer().all():
+        return col.astype(dtype=cudf.dtype("i8"))  # type: ignore[return-value]
 
-    col = _proc_inf_empty_strings(col)
+    # TODO: This can be handled by libcudf in
+    # future see StringColumn.as_numerical_column
+    converted_col = (
+        col.to_lower()
+        .find_and_replace(as_column([""]), as_column(["NaN"]))
+        .replace_multiple(
+            as_column(["+", "inf", "inity"]),  # type: ignore[arg-type]
+            as_column(["", "Inf", ""]),  # type: ignore[arg-type]
+        )
+    )
 
-    is_float = libstrings.is_float(col)
+    is_float = converted_col.is_float()
     if is_float.all():
         if _downcast in {"unsigned", "signed", "integer"}:
             warnings.warn(
@@ -227,27 +245,14 @@ def _convert_str_col(col, errors, _downcast=None):
                     "limited by float32 precision."
                 )
             )
-            return col.astype(dtype=cudf.dtype("float32"))
+            return converted_col.astype(dtype=cudf.dtype("float32"))  # type: ignore[return-value]
         else:
-            return col.astype(dtype=cudf.dtype("float64"))
+            return converted_col.astype(dtype=cudf.dtype("float64"))  # type: ignore[return-value]
     else:
         if errors == "coerce":
-            col = libcudf.string_casting.stod(col)
             non_numerics = is_float.unary_operator("not")
-            col[non_numerics] = None
-            return col
+            converted_col[non_numerics] = None
+            converted_col = converted_col.astype(np.dtype(np.float64))  # type: ignore[assignment]
+            return converted_col  # type: ignore[return-value]
         else:
             raise ValueError("Unable to convert some strings to numerics.")
-
-
-def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase:
-    """Handles empty and infinity strings"""
-    col = col.to_lower()  # type: ignore[attr-defined]
-    col = col.find_and_replace(as_column([""]), as_column(["NaN"]))
-    # TODO: This can be handled by libcudf in
-    # future see StringColumn.as_numerical_column
-    col = col.replace_multiple(  # type: ignore[attr-defined]
-        as_column(["+", "inf", "inity"]),
-        as_column(["", "Inf", ""]),
-    )
-    return col
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index bfe716f0afc..4bd5a1e7040 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -9,7 +9,7 @@
 import cupy as cp
 import llvmlite.binding as ll
 import numpy as np
-from cuda import cudart
+from cuda.bindings import runtime
 from numba import cuda, typeof
 from numba.core.datamodel import default_manager, models
 from numba.core.errors import TypingError
@@ -356,8 +356,8 @@ def set_malloc_heap_size(size=None):
     if size is None:
         size = _STRINGS_UDF_DEFAULT_HEAP_SIZE
     if size != _heap_size:
-        (ret,) = cudart.cudaDeviceSetLimit(
-            cudart.cudaLimit.cudaLimitMallocHeapSize, size
+        (ret,) = runtime.cudaDeviceSetLimit(
+            runtime.cudaLimit.cudaLimitMallocHeapSize, size
         )
         if ret.value != 0:
             raise RuntimeError("Unable to set cudaMalloc heap size")
diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py
index 094df955273..c4a063a50e8 100644
--- a/python/cudf/cudf/core/window/ewm.py
+++ b/python/cudf/cudf/core/window/ewm.py
@@ -6,7 +6,6 @@
 
 import numpy as np
 
-from cudf._lib.reduce import scan
 from cudf.api.types import is_numeric_dtype
 from cudf.core.window.rolling import _RollingBase
 
@@ -194,13 +193,8 @@ def _apply_agg_column(
         # as such we need to convert the nans to nulls before
         # passing them in.
         to_libcudf_column = source_column.astype("float64").nans_to_nulls()
-
-        return scan(
-            agg_name,
-            to_libcudf_column,
-            True,
-            com=self.com,
-            adjust=self.adjust,
+        return to_libcudf_column.scan(
+            agg_name, True, com=self.com, adjust=self.adjust
         )
 
 
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index a580c35ccbf..e2c332f34f5 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -1,10 +1,11 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION
+# Copyright (c) 2020-2025, NVIDIA CORPORATION
 from __future__ import annotations
 
 import warnings
 from typing import TYPE_CHECKING
 
 import numba
+import numpy as np
 import pandas as pd
 from pandas.api.indexers import BaseIndexer
 
@@ -273,12 +274,8 @@ def _apply_agg_column(self, source_column, agg_name):
             end = as_column(end, dtype="int32")
 
             idx = as_column(range(len(start)))
-            preceding_window = (idx - start + cudf.Scalar(1, "int32")).astype(
-                "int32"
-            )
-            following_window = (end - idx - cudf.Scalar(1, "int32")).astype(
-                "int32"
-            )
+            preceding_window = (idx - start + np.int32(1)).astype("int32")
+            following_window = (end - idx - np.int32(1)).astype("int32")
             window = None
         else:
             preceding_window = as_column(self.window)
@@ -315,7 +312,7 @@ def _apply_agg_column(self, source_column, agg_name):
                         {"dtype": source_column.dtype}
                         if callable(agg_name)
                         else self.agg_params,
-                    ).c_obj,
+                    ).plc_obj,
                 )
             )
 
diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py
index e8d634598f4..a91a4951306 100644
--- a/python/cudf/cudf/datasets.py
+++ b/python/cudf/cudf/datasets.py
@@ -4,7 +4,6 @@
 import pandas as pd
 
 import cudf
-from cudf._lib.transform import bools_to_mask
 
 __all__ = ["randomdata", "timeseries"]
 
@@ -70,7 +69,7 @@ def timeseries(
             size=len(index),
             p=[1 - nulls_frequency, nulls_frequency],
         )
-        mask_buf = bools_to_mask(cudf.core.column.as_column(mask))
+        mask_buf = cudf.core.column.as_column(mask).as_mask()
         masked_col = gdf[col]._column.set_mask(mask_buf)
         gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index)
 
diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py
index 11730e98c95..dcbdd4423fc 100644
--- a/python/cudf/cudf/io/avro.py
+++ b/python/cudf/cudf/io/avro.py
@@ -3,7 +3,7 @@
 import pylibcudf as plc
 
 import cudf
-from cudf._lib.utils import data_from_pylibcudf_io
+from cudf._lib.column import Column
 from cudf.utils import ioutils
 
 
@@ -33,11 +33,25 @@ def read_avro(
     if not isinstance(skip_rows, int) or skip_rows < 0:
         raise TypeError("skip_rows must be an int >= 0")
 
-    plc_result = plc.io.avro.read_avro(
-        plc.io.types.SourceInfo([filepath_or_buffer]),
-        columns,
-        skip_rows,
-        num_rows,
+    options = (
+        plc.io.avro.AvroReaderOptions.builder(
+            plc.io.types.SourceInfo([filepath_or_buffer])
+        )
+        .skip_rows(skip_rows)
+        .num_rows(num_rows)
+        .build()
     )
 
-    return cudf.DataFrame._from_data(*data_from_pylibcudf_io(plc_result))
+    if columns is not None and len(columns) > 0:
+        options.set_columns(columns)
+
+    plc_result = plc.io.avro.read_avro(options)
+    data = {
+        name: Column.from_pylibcudf(col)
+        for name, col in zip(
+            plc_result.column_names(include_children=False),
+            plc_result.columns,
+            strict=True,
+        )
+    }
+    return cudf.DataFrame._from_data(data)
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index 3dc8915bfd1..6d617cbf38e 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -1,57 +1,73 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
+import errno
+import itertools
+import os
 import warnings
 from collections import abc
 from io import BytesIO, StringIO
+from typing import cast
 
 import numpy as np
+import pandas as pd
+
+import pylibcudf as plc
 
 import cudf
-from cudf import _lib as libcudf
-from cudf.api.types import is_scalar
+from cudf._lib.column import Column
+from cudf._lib.types import dtype_to_pylibcudf_type
+from cudf.api.types import is_hashable, is_scalar
+from cudf.core.buffer import acquire_spill_lock
 from cudf.utils import ioutils
 from cudf.utils.dtypes import _maybe_convert_to_default_type
 from cudf.utils.performance_tracking import _performance_tracking
 
+_CSV_HEX_TYPE_MAP = {
+    "hex": np.dtype("int64"),
+    "hex64": np.dtype("int64"),
+    "hex32": np.dtype("int32"),
+}
+
 
 @_performance_tracking
 @ioutils.doc_read_csv()
 def read_csv(
     filepath_or_buffer,
-    sep=",",
-    delimiter=None,
+    sep: str = ",",
+    delimiter: str | None = None,
     header="infer",
     names=None,
     index_col=None,
     usecols=None,
     prefix=None,
-    mangle_dupe_cols=True,
+    mangle_dupe_cols: bool = True,
     dtype=None,
     true_values=None,
     false_values=None,
-    skipinitialspace=False,
-    skiprows=0,
-    skipfooter=0,
-    nrows=None,
+    skipinitialspace: bool = False,
+    skiprows: int = 0,
+    skipfooter: int = 0,
+    nrows: int | None = None,
     na_values=None,
-    keep_default_na=True,
-    na_filter=True,
-    skip_blank_lines=True,
+    keep_default_na: bool = True,
+    na_filter: bool = True,
+    skip_blank_lines: bool = True,
     parse_dates=None,
-    dayfirst=False,
+    dayfirst: bool = False,
     compression="infer",
-    thousands=None,
-    decimal=".",
-    lineterminator="\n",
-    quotechar='"',
-    quoting=0,
-    doublequote=True,
-    comment=None,
-    delim_whitespace=False,
-    byte_range=None,
+    thousands: str | None = None,
+    decimal: str = ".",
+    lineterminator: str = "\n",
+    quotechar: str = '"',
+    quoting: int = 0,
+    doublequote: bool = True,
+    comment: str | None = None,
+    delim_whitespace: bool = False,
+    byte_range: list[int] | tuple[int, int] | None = None,
     storage_options=None,
-    bytes_per_thread=None,
-):
+    bytes_per_thread: int | None = None,
+) -> cudf.DataFrame:
     """{docstring}"""
 
     if delim_whitespace is not False:
@@ -77,60 +93,233 @@ def read_csv(
     if na_values is not None and is_scalar(na_values):
         na_values = [na_values]
 
-    df = libcudf.csv.read_csv(
-        filepath_or_buffer,
-        lineterminator=lineterminator,
-        quotechar=quotechar,
-        quoting=quoting,
-        doublequote=doublequote,
-        header=header,
-        mangle_dupe_cols=mangle_dupe_cols,
-        usecols=usecols,
-        sep=sep,
-        delimiter=delimiter,
-        delim_whitespace=delim_whitespace,
-        skipinitialspace=skipinitialspace,
-        names=names,
-        dtype=dtype,
-        skipfooter=skipfooter,
-        skiprows=skiprows,
-        dayfirst=dayfirst,
-        compression=compression,
-        thousands=thousands,
-        decimal=decimal,
-        true_values=true_values,
-        false_values=false_values,
-        nrows=nrows,
-        byte_range=byte_range,
-        skip_blank_lines=skip_blank_lines,
-        parse_dates=parse_dates,
-        comment=comment,
-        na_values=na_values,
-        keep_default_na=keep_default_na,
-        na_filter=na_filter,
-        prefix=prefix,
-        index_col=index_col,
+    if not isinstance(filepath_or_buffer, (BytesIO, StringIO, bytes)):
+        if not os.path.isfile(filepath_or_buffer):
+            raise FileNotFoundError(
+                errno.ENOENT, os.strerror(errno.ENOENT), filepath_or_buffer
+            )
+
+    if isinstance(filepath_or_buffer, StringIO):
+        filepath_or_buffer = filepath_or_buffer.read().encode()
+    elif isinstance(filepath_or_buffer, str) and not os.path.isfile(
+        filepath_or_buffer
+    ):
+        filepath_or_buffer = filepath_or_buffer.encode()
+
+    _validate_args(
+        delimiter,
+        sep,
+        delim_whitespace,
+        decimal,
+        thousands,
+        nrows,
+        skipfooter,
+        byte_range,
+        skiprows,
+    )
+
+    # Alias sep -> delimiter.
+    if delimiter is None:
+        delimiter = sep
+
+    delimiter = str(delimiter)
+
+    if byte_range is None:
+        byte_range = (0, 0)
+
+    if compression is None:
+        c_compression = plc.io.types.CompressionType.NONE
+    else:
+        compression_map = {
+            "infer": plc.io.types.CompressionType.AUTO,
+            "gzip": plc.io.types.CompressionType.GZIP,
+            "bz2": plc.io.types.CompressionType.BZIP2,
+            "zip": plc.io.types.CompressionType.ZIP,
+        }
+        c_compression = compression_map[compression]
+
+    # We need this later when setting index cols
+    orig_header = header
+
+    if names is not None:
+        # explicitly mentioned name, so don't check header
+        if header is None or header == "infer":
+            header = -1
+        else:
+            header = header
+        names = list(names)
+    else:
+        if header is None:
+            header = -1
+        elif header == "infer":
+            header = 0
+
+    hex_cols: list[abc.Hashable] = []
+    new_dtypes: list[plc.DataType] | dict[abc.Hashable, plc.DataType] = []
+    if dtype is not None:
+        if isinstance(dtype, abc.Mapping):
+            new_dtypes = {}
+            for k, col_type in dtype.items():
+                if is_hashable(col_type) and col_type in _CSV_HEX_TYPE_MAP:
+                    col_type = _CSV_HEX_TYPE_MAP[col_type]
+                    hex_cols.append(str(k))
+
+                new_dtypes[k] = _get_plc_data_type_from_dtype(
+                    cudf.dtype(col_type)
+                )
+        elif cudf.api.types.is_scalar(dtype) or isinstance(
+            dtype, (np.dtype, pd.api.extensions.ExtensionDtype, type)
+        ):
+            if is_hashable(dtype) and dtype in _CSV_HEX_TYPE_MAP:
+                dtype = _CSV_HEX_TYPE_MAP[dtype]
+                hex_cols.append(0)
+
+            cast(list, new_dtypes).append(_get_plc_data_type_from_dtype(dtype))
+        elif isinstance(dtype, abc.Collection):
+            for index, col_dtype in enumerate(dtype):
+                if is_hashable(col_dtype) and col_dtype in _CSV_HEX_TYPE_MAP:
+                    col_dtype = _CSV_HEX_TYPE_MAP[col_dtype]
+                    hex_cols.append(index)
+
+                new_dtypes.append(_get_plc_data_type_from_dtype(col_dtype))
+        else:
+            raise ValueError(
+                "dtype should be a scalar/str/list-like/dict-like"
+            )
+    options = (
+        plc.io.csv.CsvReaderOptions.builder(
+            plc.io.SourceInfo([filepath_or_buffer])
+        )
+        .compression(c_compression)
+        .mangle_dupe_cols(mangle_dupe_cols)
+        .byte_range_offset(byte_range[0])
+        .byte_range_size(byte_range[1])
+        .nrows(nrows if nrows is not None else -1)
+        .skiprows(skiprows)
+        .skipfooter(skipfooter)
+        .quoting(quoting)
+        .lineterminator(str(lineterminator))
+        .quotechar(quotechar)
+        .decimal(decimal)
+        .delim_whitespace(delim_whitespace)
+        .skipinitialspace(skipinitialspace)
+        .skip_blank_lines(skip_blank_lines)
+        .doublequote(doublequote)
+        .keep_default_na(keep_default_na)
+        .na_filter(na_filter)
+        .dayfirst(dayfirst)
+        .build()
     )
 
+    options.set_header(header)
+
+    if names is not None:
+        options.set_names([str(name) for name in names])
+
+    if prefix is not None:
+        options.set_prefix(prefix)
+
+    if usecols is not None:
+        if all(isinstance(col, int) for col in usecols):
+            options.set_use_cols_indexes(list(usecols))
+        else:
+            options.set_use_cols_names([str(name) for name in usecols])
+
+    if delimiter is not None:
+        options.set_delimiter(delimiter)
+
+    if thousands is not None:
+        options.set_thousands(thousands)
+
+    if comment is not None:
+        options.set_comment(comment)
+
+    if parse_dates is not None:
+        options.set_parse_dates(list(parse_dates))
+
+    if hex_cols is not None:
+        options.set_parse_hex(list(hex_cols))
+
+    options.set_dtypes(new_dtypes)
+
+    if true_values is not None:
+        options.set_true_values([str(val) for val in true_values])
+
+    if false_values is not None:
+        options.set_false_values([str(val) for val in false_values])
+
+    if na_values is not None:
+        options.set_na_values([str(val) for val in na_values])
+
+    table_w_meta = plc.io.csv.read_csv(options)
+    data = {
+        name: Column.from_pylibcudf(col)
+        for name, col in zip(
+            table_w_meta.column_names(include_children=False),
+            table_w_meta.columns,
+            strict=True,
+        )
+    }
+
+    df = cudf.DataFrame._from_data(data)
+
+    if isinstance(dtype, abc.Mapping):
+        for k, v in dtype.items():
+            if isinstance(cudf.dtype(v), cudf.CategoricalDtype):
+                df._data[str(k)] = df._data[str(k)].astype(v)
+    elif dtype == "category" or isinstance(dtype, cudf.CategoricalDtype):
+        df = df.astype(dtype)
+    elif isinstance(dtype, abc.Collection) and not is_scalar(dtype):
+        for index, col_dtype in enumerate(dtype):
+            if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype):
+                col_name = df._column_names[index]
+                df._data[col_name] = df._data[col_name].astype(col_dtype)
+
+    if names is not None and len(names) and isinstance(names[0], int):
+        df.columns = [int(x) for x in df._data]
+    elif (
+        names is None
+        and header == -1
+        and cudf.get_option("mode.pandas_compatible")
+    ):
+        df.columns = [int(x) for x in df._column_names]
+
+    # Set index if the index_col parameter is passed
+    if index_col is not None and index_col is not False:
+        if isinstance(index_col, int):
+            index_col_name = df._data.get_labels_by_index(index_col)[0]
+            df = df.set_index(index_col_name)
+            if (
+                isinstance(index_col_name, str)
+                and names is None
+                and orig_header == "infer"
+            ):
+                if index_col_name.startswith("Unnamed:"):
+                    # TODO: Try to upstream it to libcudf
+                    # csv reader in future
+                    df.index.name = None
+            elif names is None:
+                df.index.name = index_col
+        else:
+            df = df.set_index(index_col)
+
     if dtype is None or isinstance(dtype, abc.Mapping):
         # There exists some dtypes in the result columns that is inferred.
         # Find them and map them to the default dtypes.
         specified_dtypes = {} if dtype is None else dtype
-        unspecified_dtypes = {
-            name: dtype
-            for name, dtype in df._dtypes
-            if name not in specified_dtypes
-        }
         default_dtypes = {}
-
-        for name, dt in unspecified_dtypes.items():
-            if dt == np.dtype("i1"):
+        for name, dt in df._dtypes:
+            if name in specified_dtypes:
+                continue
+            elif dt == np.dtype("i1"):
                 # csv reader reads all null column as int8.
                 # The dtype should remain int8.
                 default_dtypes[name] = dt
             else:
                 default_dtypes[name] = _maybe_convert_to_default_type(dt)
-        df = df.astype(default_dtypes)
+
+        if default_dtypes:
+            df = df.astype(default_dtypes)
 
     return df
 
@@ -138,17 +327,17 @@ def read_csv(
 @_performance_tracking
 @ioutils.doc_to_csv()
 def to_csv(
-    df,
+    df: cudf.DataFrame,
     path_or_buf=None,
-    sep=",",
-    na_rep="",
+    sep: str = ",",
+    na_rep: str = "",
     columns=None,
-    header=True,
-    index=True,
+    header: bool = True,
+    index: bool = True,
     encoding=None,
     compression=None,
-    lineterminator="\n",
-    chunksize=None,
+    lineterminator: str = "\n",
+    chunksize: int | None = None,
     storage_options=None,
 ):
     """{docstring}"""
@@ -187,15 +376,10 @@ def to_csv(
             )
 
     for _, dtype in df._dtypes:
-        if isinstance(dtype, cudf.ListDtype):
-            raise NotImplementedError(
-                "Writing to csv format is not yet supported with "
-                "list columns."
-            )
-        elif isinstance(dtype, cudf.StructDtype):
+        if isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)):
             raise NotImplementedError(
                 "Writing to csv format is not yet supported with "
-                "Struct columns."
+                f"{dtype} columns."
             )
 
     # TODO: Need to typecast categorical columns to the underlying
@@ -208,7 +392,7 @@ def to_csv(
         df = df.copy(deep=False)
         for col_name, col in df._column_labels_and_values:
             if isinstance(col.dtype, cudf.CategoricalDtype):
-                df._data[col_name] = col.astype(col.categories.dtype)
+                df._data[col_name] = col.astype(col.dtype.categories.dtype)
 
         if isinstance(df.index, cudf.CategoricalIndex):
             df.index = df.index.astype(df.index.categories.dtype)
@@ -218,7 +402,7 @@ def to_csv(
     if ioutils.is_fsspec_open_file(path_or_buf):
         with path_or_buf as file_obj:
             file_obj = ioutils.get_IOBase_writer(file_obj)
-            libcudf.csv.write_csv(
+            _plc_write_csv(
                 df,
                 path_or_buf=file_obj,
                 sep=sep,
@@ -229,7 +413,7 @@ def to_csv(
                 index=index,
             )
     else:
-        libcudf.csv.write_csv(
+        _plc_write_csv(
             df,
             path_or_buf=path_or_buf,
             sep=sep,
@@ -243,3 +427,127 @@ def to_csv(
     if return_as_string:
         path_or_buf.seek(0)
         return path_or_buf.read()
+
+
+@acquire_spill_lock()
+def _plc_write_csv(
+    table: cudf.DataFrame,
+    path_or_buf=None,
+    sep: str = ",",
+    na_rep: str = "",
+    header: bool = True,
+    lineterminator: str = "\n",
+    rows_per_chunk: int = 8,
+    index: bool = True,
+) -> None:
+    """Write ``table`` (plus its index when ``index`` is set) as CSV."""
+    source_columns = (
+        itertools.chain(table.index._columns, table._columns)
+        if index
+        else table._columns
+    )
+    plc_columns = [col.to_pylibcudf(mode="read") for col in source_columns]
+
+    col_names = []
+    if header:
+        # Null-like column labels are replaced by ``na_rep``.
+        labels = [
+            na_rep if label is None or pd.isnull(label) else label
+            for label in table._column_names
+        ]
+        if index:
+            labels = [*table.index.names, *labels]
+        # A lone unnamed label is written as '""'; other unnamed ones as ''.
+        only_one = len(labels) == 1
+        for label in labels:
+            if label in (None, ""):
+                col_names.append('""' if only_one else "")
+            else:
+                col_names.append(str(label))
+
+    try:
+        options = (
+            plc.io.csv.CsvWriterOptions.builder(
+                plc.io.SinkInfo([path_or_buf]), plc.Table(plc_columns)
+            )
+            .names(col_names)
+            .na_rep(na_rep)
+            .include_header(header)
+            .rows_per_chunk(rows_per_chunk)
+            .line_terminator(str(lineterminator))
+            .inter_column_delimiter(str(sep))
+            .true_value("True")
+            .false_value("False")
+            .build()
+        )
+        plc.io.csv.write_csv(options)
+    except OverflowError as err:
+        raise OverflowError(
+            f"Writing CSV file with chunksize={rows_per_chunk} failed. "
+            "Consider providing a smaller chunksize argument."
+        ) from err
+
+
+def _validate_args(
+    delimiter: str | None,
+    sep: str,
+    delim_whitespace: bool,
+    decimal: str,
+    thousands: str | None,
+    nrows: int | None,
+    skipfooter: int,
+    byte_range: list[int] | tuple[int, int] | None,
+    skiprows: int,
+) -> None:
+    """Raise ``ValueError`` for mutually incompatible ``read_csv`` options."""
+    if delim_whitespace and delimiter is not None:
+        raise ValueError("cannot set both delimiter and delim_whitespace")
+    if delim_whitespace and sep != ",":
+        raise ValueError("cannot set both sep and delim_whitespace")
+
+    # ``sep`` is the fallback whenever ``delimiter`` is None or empty.
+    effective_sep = delimiter or sep
+    if decimal == effective_sep:
+        raise ValueError("decimal cannot be the same as delimiter")
+    if thousands == effective_sep:
+        raise ValueError("thousands cannot be the same as delimiter")
+
+    row_limited = nrows is not None
+    if row_limited and skipfooter != 0:
+        raise ValueError("cannot use both nrows and skipfooter parameters")
+
+    if byte_range is not None and (
+        skipfooter != 0 or skiprows != 0 or row_limited
+    ):
+        raise ValueError(
+            "cannot manually limit rows to be read when using the byte range parameter"
+        )
+
+
+def _get_plc_data_type_from_dtype(dtype) -> plc.DataType:
+    """Translate a CSV ``dtype`` specification into a ``plc.DataType``."""
+    # TODO: Remove this work-around once Dictionary types in libcudf
+    # are fully mapped to categorical columns:
+    # https://github.com/rapidsai/cudf/issues/3960
+    if isinstance(dtype, cudf.CategoricalDtype):
+        dtype = dtype.categories.dtype
+    elif dtype == "category":
+        dtype = "str"
+
+    # String aliases for timestamp types that are resolved here directly.
+    type_id = plc.types.TypeId
+    timestamp_aliases = {
+        "date32": type_id.TIMESTAMP_DAYS,
+        "date": type_id.TIMESTAMP_MILLISECONDS,
+        "date64": type_id.TIMESTAMP_MILLISECONDS,
+        "timestamp": type_id.TIMESTAMP_MILLISECONDS,
+        "timestamp[us]": type_id.TIMESTAMP_MICROSECONDS,
+        "timestamp[s]": type_id.TIMESTAMP_SECONDS,
+        "timestamp[ms]": type_id.TIMESTAMP_MILLISECONDS,
+        "timestamp[ns]": type_id.TIMESTAMP_NANOSECONDS,
+    }
+    if isinstance(dtype, str) and dtype in timestamp_aliases:
+        return plc.DataType(timestamp_aliases[dtype])
+
+    # Anything else is resolved through ``cudf.dtype``.
+    return dtype_to_pylibcudf_type(cudf.dtype(dtype))
diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py
index fe8e446f9c0..3b3fd5f7c56 100644
--- a/python/cudf/cudf/io/dlpack.py
+++ b/python/cudf/cudf/io/dlpack.py
@@ -1,13 +1,14 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
+import pylibcudf as plc
 
 import cudf
-from cudf._lib import interop as libdlpack
 from cudf.core.column import ColumnBase
 from cudf.utils import ioutils
 
 
-def from_dlpack(pycapsule_obj):
+def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame:
     """Converts from a DLPack tensor to a cuDF object.
 
     DLPack is an open-source memory tensor structure:
@@ -33,18 +34,21 @@ def from_dlpack(pycapsule_obj):
     cuDF from_dlpack() assumes column-major (Fortran order) input. If the input
     tensor is row-major, transpose it before passing it to this function.
     """
+    plc_table = plc.interop.from_dlpack(pycapsule_obj)
+    data = dict(
+        enumerate(
+            (ColumnBase.from_pylibcudf(col) for col in plc_table.columns())
+        )
+    )
 
-    columns = libdlpack.from_dlpack(pycapsule_obj)
-    data = dict(enumerate(columns))
-
-    if len(columns) == 1:
+    if len(data) == 1:
         return cudf.Series._from_data(data)
     else:
         return cudf.DataFrame._from_data(data)
 
 
 @ioutils.doc_to_dlpack()
-def to_dlpack(cudf_obj):
+def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex):
     """Converts a cuDF object to a DLPack tensor.
 
     DLPack is an open-source memory tensor structure:
@@ -80,13 +84,14 @@ def to_dlpack(cudf_obj):
 
     if any(
         not cudf.api.types._is_non_decimal_numeric_dtype(dtype)
-        for _, dtype in gdf._dtypes
+        for _, dtype in gdf._dtypes  # type: ignore[union-attr]
     ):
         raise TypeError("non-numeric data not yet supported")
 
     dtype = cudf.utils.dtypes.find_common_type(
-        [dtype for _, dtype in gdf._dtypes]
+        [dtype for _, dtype in gdf._dtypes]  # type: ignore[union-attr]
     )
     gdf = gdf.astype(dtype)
-
-    return libdlpack.to_dlpack([*gdf._columns])
+    return plc.interop.to_dlpack(
+        plc.Table([col.to_pylibcudf(mode="read") for col in gdf._columns])
+    )
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 89af00c713d..ff326e09315 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -15,7 +15,6 @@
 import cudf
 from cudf._lib.column import Column
 from cudf._lib.types import dtype_to_pylibcudf_type
-from cudf._lib.utils import _data_from_columns, data_from_pylibcudf_io
 from cudf.core.buffer import acquire_spill_lock
 from cudf.utils import ioutils
 from cudf.utils.dtypes import _maybe_convert_to_default_type
@@ -54,6 +53,22 @@ def _get_cudf_schema_element_from_dtype(
     return lib_type, child_types
 
 
+def _to_plc_compression(
+    compression: Literal["infer", "gzip", "bz2", "zip", "xz", None],
+) -> plc.io.types.CompressionType:
+    """Map a ``compression`` argument to a pylibcudf compression type."""
+    kinds = plc.io.types.CompressionType
+    if compression is None:
+        return kinds.NONE
+    if compression == "gzip":
+        return kinds.GZIP
+    if compression == "bz2":
+        return kinds.BZIP2
+    if compression == "zip":
+        return kinds.ZIP
+    # Any other value (e.g. "infer" or "xz") falls back to AUTO.
+    return kinds.AUTO
+
+
 @ioutils.doc_read_json()
 def read_json(
     path_or_buf,
@@ -91,11 +106,6 @@ def read_json(
         if dtype is None:
             dtype = True
 
-        if kwargs:
-            raise ValueError(
-                "cudf engine doesn't support the "
-                f"following keyword arguments: {list(kwargs.keys())}"
-            )
         if args:
             raise ValueError(
                 "cudf engine doesn't support the "
@@ -120,17 +130,7 @@ def read_json(
             if isinstance(source, str) and not os.path.isfile(source):
                 filepaths_or_buffers[idx] = source.encode()
 
-        if compression is not None:
-            if compression == "gzip":
-                c_compression = plc.io.types.CompressionType.GZIP
-            elif compression == "bz2":
-                c_compression = plc.io.types.CompressionType.BZIP2
-            elif compression == "zip":
-                c_compression = plc.io.types.CompressionType.ZIP
-            else:
-                c_compression = plc.io.types.CompressionType.AUTO
-        else:
-            c_compression = plc.io.types.CompressionType.NONE
+        c_compression = _to_plc_compression(compression)
 
         if on_bad_lines.lower() == "error":
             c_on_bad_lines = plc.io.types.JSONRecoveryMode.FAIL
@@ -166,43 +166,53 @@ def read_json(
         if cudf.get_option("io.json.low_memory") and lines:
             res_cols, res_col_names, res_child_names = (
                 plc.io.json.chunked_read_json(
+                    plc.io.json._setup_json_reader_options(
+                        plc.io.SourceInfo(filepaths_or_buffers),
+                        processed_dtypes,
+                        c_compression,
+                        keep_quotes=keep_quotes,
+                        mixed_types_as_string=mixed_types_as_string,
+                        prune_columns=prune_columns,
+                        recovery_mode=c_on_bad_lines,
+                    )
+                )
+            )
+            data = {
+                name: Column.from_pylibcudf(col)
+                for name, col in zip(res_col_names, res_cols, strict=True)
+            }
+            df = cudf.DataFrame._from_data(data)
+            ioutils._add_df_col_struct_names(df, res_child_names)
+            return df
+        else:
+            table_w_meta = plc.io.json.read_json(
+                plc.io.json._setup_json_reader_options(
                     plc.io.SourceInfo(filepaths_or_buffers),
                     processed_dtypes,
                     c_compression,
+                    lines,
+                    byte_range_offset=byte_range[0]
+                    if byte_range is not None
+                    else 0,
+                    byte_range_size=byte_range[1]
+                    if byte_range is not None
+                    else 0,
                     keep_quotes=keep_quotes,
                     mixed_types_as_string=mixed_types_as_string,
                     prune_columns=prune_columns,
                     recovery_mode=c_on_bad_lines,
+                    extra_parameters=kwargs,
                 )
             )
-            df = cudf.DataFrame._from_data(
-                *_data_from_columns(
-                    columns=[Column.from_pylibcudf(col) for col in res_cols],
-                    column_names=res_col_names,
-                    index_names=None,
+            data = {
+                name: Column.from_pylibcudf(col)
+                for name, col in zip(
+                    table_w_meta.column_names(include_children=False),
+                    table_w_meta.columns,
+                    strict=True,
                 )
-            )
-            ioutils._add_df_col_struct_names(df, res_child_names)
-            return df
-        else:
-            table_w_meta = plc.io.json.read_json(
-                plc.io.SourceInfo(filepaths_or_buffers),
-                processed_dtypes,
-                c_compression,
-                lines,
-                byte_range_offset=byte_range[0]
-                if byte_range is not None
-                else 0,
-                byte_range_size=byte_range[1] if byte_range is not None else 0,
-                keep_quotes=keep_quotes,
-                mixed_types_as_string=mixed_types_as_string,
-                prune_columns=prune_columns,
-                recovery_mode=c_on_bad_lines,
-            )
-
-            df = cudf.DataFrame._from_data(
-                *data_from_pylibcudf_io(table_w_meta)
-            )
+            }
+            df = cudf.DataFrame._from_data(data)
 
             # Post-processing to add in struct column names
             ioutils._add_df_col_struct_names(df, table_w_meta.child_names)
@@ -289,23 +299,29 @@ def _plc_write_json(
     include_nulls: bool = True,
     lines: bool = False,
     rows_per_chunk: int = 1024 * 64,  # 64K rows
+    compression: Literal["infer", "gzip", "bz2", "zip", "xz", None] = None,
 ) -> None:
     try:
-        plc.io.json.write_json(
-            plc.io.SinkInfo([path_or_buf]),
-            plc.io.TableWithMetadata(
-                plc.Table(
-                    [col.to_pylibcudf(mode="read") for col in table._columns]
-                ),
-                colnames,
+        tbl_w_meta = plc.io.TableWithMetadata(
+            plc.Table(
+                [col.to_pylibcudf(mode="read") for col in table._columns]
             ),
-            na_rep,
-            include_nulls,
-            lines,
-            rows_per_chunk,
-            true_value="true",
-            false_value="false",
+            colnames,
+        )
+        options = (
+            plc.io.json.JsonWriterOptions.builder(
+                plc.io.SinkInfo([path_or_buf]), tbl_w_meta.tbl
+            )
+            .metadata(tbl_w_meta)
+            .na_rep(na_rep)
+            .include_nulls(include_nulls)
+            .lines(lines)
+            .compression(_to_plc_compression(compression))
+            .build()
         )
+        if rows_per_chunk != np.iinfo(np.int32).max:
+            options.set_rows_per_chunk(rows_per_chunk)
+        plc.io.json.write_json(options)
     except OverflowError as err:
         raise OverflowError(
             f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 68b60809bb9..f3124552fd1 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -1,147 +1,29 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
-import datetime
+import itertools
 import warnings
+from typing import TYPE_CHECKING, Literal
 
 import pyarrow as pa
 
+import pylibcudf as plc
+
 import cudf
-from cudf._lib import orc as liborc
+from cudf._lib.column import Column
+from cudf._lib.types import dtype_to_pylibcudf_type
 from cudf.api.types import is_list_like
+from cudf.core.buffer import acquire_spill_lock
+from cudf.core.index import _index_from_data
 from cudf.utils import ioutils
 
+try:
+    import ujson as json  # type: ignore[import-untyped]
+except ImportError:
+    import json
 
-def _make_empty_df(filepath_or_buffer, columns):
-    from pyarrow import orc
-
-    orc_file = orc.ORCFile(filepath_or_buffer)
-    schema = orc_file.schema
-    col_names = schema.names if columns is None else columns
-    return cudf.DataFrame._from_data(
-        data={
-            col_name: cudf.core.column.column_empty(
-                row_count=0,
-                dtype=schema.field(col_name).type.to_pandas_dtype(),
-            )
-            for col_name in col_names
-        }
-    )
-
-
-def _parse_column_statistics(cs, column_statistics_blob):
-    # Initialize stats to return and parse stats blob
-    column_statistics = {}
-    cs.ParseFromString(column_statistics_blob)
-
-    # Load from parsed stats blob into stats to return
-    if cs.HasField("numberOfValues"):
-        column_statistics["number_of_values"] = cs.numberOfValues
-    if cs.HasField("hasNull"):
-        column_statistics["has_null"] = cs.hasNull
-
-    if cs.HasField("intStatistics"):
-        column_statistics["minimum"] = (
-            cs.intStatistics.minimum
-            if cs.intStatistics.HasField("minimum")
-            else None
-        )
-        column_statistics["maximum"] = (
-            cs.intStatistics.maximum
-            if cs.intStatistics.HasField("maximum")
-            else None
-        )
-        column_statistics["sum"] = (
-            cs.intStatistics.sum if cs.intStatistics.HasField("sum") else None
-        )
-
-    elif cs.HasField("doubleStatistics"):
-        column_statistics["minimum"] = (
-            cs.doubleStatistics.minimum
-            if cs.doubleStatistics.HasField("minimum")
-            else None
-        )
-        column_statistics["maximum"] = (
-            cs.doubleStatistics.maximum
-            if cs.doubleStatistics.HasField("maximum")
-            else None
-        )
-        column_statistics["sum"] = (
-            cs.doubleStatistics.sum
-            if cs.doubleStatistics.HasField("sum")
-            else None
-        )
-
-    elif cs.HasField("stringStatistics"):
-        column_statistics["minimum"] = (
-            cs.stringStatistics.minimum
-            if cs.stringStatistics.HasField("minimum")
-            else None
-        )
-        column_statistics["maximum"] = (
-            cs.stringStatistics.maximum
-            if cs.stringStatistics.HasField("maximum")
-            else None
-        )
-        column_statistics["sum"] = cs.stringStatistics.sum
-
-    elif cs.HasField("bucketStatistics"):
-        column_statistics["true_count"] = cs.bucketStatistics.count[0]
-        column_statistics["false_count"] = (
-            column_statistics["number_of_values"]
-            - column_statistics["true_count"]
-        )
-
-    elif cs.HasField("decimalStatistics"):
-        column_statistics["minimum"] = (
-            cs.decimalStatistics.minimum
-            if cs.decimalStatistics.HasField("minimum")
-            else None
-        )
-        column_statistics["maximum"] = (
-            cs.decimalStatistics.maximum
-            if cs.decimalStatistics.HasField("maximum")
-            else None
-        )
-        column_statistics["sum"] = cs.decimalStatistics.sum
-
-    elif cs.HasField("dateStatistics"):
-        column_statistics["minimum"] = (
-            datetime.datetime.fromtimestamp(
-                datetime.timedelta(cs.dateStatistics.minimum).total_seconds(),
-                datetime.timezone.utc,
-            )
-            if cs.dateStatistics.HasField("minimum")
-            else None
-        )
-        column_statistics["maximum"] = (
-            datetime.datetime.fromtimestamp(
-                datetime.timedelta(cs.dateStatistics.maximum).total_seconds(),
-                datetime.timezone.utc,
-            )
-            if cs.dateStatistics.HasField("maximum")
-            else None
-        )
-
-    elif cs.HasField("timestampStatistics"):
-        # Before ORC-135, the local timezone offset was included and they were
-        # stored as minimum and maximum. After ORC-135, the timestamp is
-        # adjusted to UTC before being converted to milliseconds and stored
-        # in minimumUtc and maximumUtc.
-        # TODO: Support minimum and maximum by reading writer's local timezone
-        if cs.timestampStatistics.HasField(
-            "minimumUtc"
-        ) and cs.timestampStatistics.HasField("maximumUtc"):
-            column_statistics["minimum"] = datetime.datetime.fromtimestamp(
-                cs.timestampStatistics.minimumUtc / 1000, datetime.timezone.utc
-            )
-            column_statistics["maximum"] = datetime.datetime.fromtimestamp(
-                cs.timestampStatistics.maximumUtc / 1000, datetime.timezone.utc
-            )
-
-    elif cs.HasField("binaryStatistics"):
-        column_statistics["sum"] = cs.binaryStatistics.sum
-
-    return column_statistics
+if TYPE_CHECKING:
+    from cudf.core.column import ColumnBase
 
 
 @ioutils.doc_read_orc_metadata()
@@ -175,11 +57,12 @@ def read_orc_statistics(
         path_or_buf = ioutils._select_single_source(
             path_or_buf, "read_orc_statistics"
         )
-        (
-            column_names,
-            parsed_file_statistics,
-            parsed_stripes_statistics,
-        ) = liborc.read_parsed_orc_statistics(path_or_buf)
+        parsed = plc.io.orc.read_parsed_orc_statistics(
+            plc.io.SourceInfo([path_or_buf])
+        )
+        column_names = parsed.column_names
+        parsed_file_statistics = parsed.file_stats
+        parsed_stripes_statistics = parsed.stripes_stats
 
         # Parse file statistics
         file_statistics = {
@@ -273,16 +156,14 @@ def read_orc(
     columns=None,
     filters=None,
     stripes=None,
-    skiprows=None,
-    num_rows=None,
-    use_index=True,
+    skiprows: int | None = None,
+    num_rows: int | None = None,
+    use_index: bool = True,
     timestamp_type=None,
     storage_options=None,
     bytes_per_thread=None,
 ):
     """{docstring}"""
-    from cudf import DataFrame
-
     if skiprows is not None:
         # Do not remove until cuIO team approves its removal.
         warnings.warn(
@@ -329,31 +210,168 @@ def read_orc(
 
         # Return empty if everything was filtered
         if len(selected_stripes) == 0:
-            return _make_empty_df(filepaths_or_buffers[0], columns)
+            from pyarrow import orc
+
+            orc_file = orc.ORCFile(filepaths_or_buffers[0])
+            schema = orc_file.schema
+            col_names = schema.names if columns is None else columns
+            return cudf.DataFrame._from_data(
+                data={
+                    col_name: cudf.core.column.column_empty(
+                        row_count=0,
+                        dtype=schema.field(col_name).type.to_pandas_dtype(),
+                    )
+                    for col_name in col_names
+                }
+            )
         else:
             stripes = selected_stripes
 
     if engine == "cudf":
-        return DataFrame._from_data(
-            *liborc.read_orc(
-                filepaths_or_buffers,
-                columns,
-                stripes,
-                skiprows,
-                num_rows,
-                use_index,
-                timestamp_type,
+        if columns is not None:
+            columns = [str(col) for col in columns]
+
+        if skiprows is None:
+            skiprows = 0
+        elif not isinstance(skiprows, int) or skiprows < 0:
+            raise TypeError("skiprows must be an int >= 0")
+
+        if num_rows is None:
+            num_rows = -1
+        elif not isinstance(num_rows, int) or num_rows < -1:
+            raise TypeError("num_rows must be an int >= -1")
+
+        options = (
+            plc.io.orc.OrcReaderOptions.builder(
+                plc.io.types.SourceInfo(filepaths_or_buffers)
             )
+            .use_index(use_index)
+            .build()
         )
+        if num_rows >= 0:
+            options.set_num_rows(num_rows)
+        if skiprows >= 0:
+            options.set_skip_rows(skiprows)
+        if stripes is not None and len(stripes) > 0:
+            options.set_stripes(stripes)
+        if timestamp_type is not None:
+            options.set_timestamp_type(
+                dtype_to_pylibcudf_type(cudf.dtype(timestamp_type))
+            )
+        if columns is not None and len(columns) > 0:
+            options.set_columns(columns)
+
+        tbl_w_meta = plc.io.orc.read_orc(options)
+
+        if isinstance(columns, list) and len(columns) == 0:
+            # When `columns=[]`, index needs to be
+            # established, but not the columns.
+            nrows = tbl_w_meta.tbl.num_rows()
+            data = {}
+            index = cudf.RangeIndex(nrows)
+        else:
+            names = tbl_w_meta.column_names(include_children=False)
+            index_col = None
+            is_range_index = False
+            reset_index_name = False
+            range_idx = None
+
+            if len(tbl_w_meta.per_file_user_data) > 0:
+                json_str = (
+                    tbl_w_meta.per_file_user_data[0]
+                    .get(b"pandas", b"")
+                    .decode("utf-8")
+                )
+                if json_str != "":
+                    meta = json.loads(json_str)
+                    if (
+                        "index_columns" in meta
+                        and len(meta["index_columns"]) > 0
+                    ):
+                        index_col = meta["index_columns"]
+                        if (
+                            isinstance(index_col[0], dict)
+                            and index_col[0]["kind"] == "range"
+                        ):
+                            is_range_index = True
+                        else:
+                            index_col_names = {}
+                            for idx_col in index_col:
+                                for c in meta["columns"]:
+                                    if c["field_name"] == idx_col:
+                                        index_col_names[idx_col] = (
+                                            c["name"] or c["field_name"]
+                                        )
+                                        if c["name"] is None:
+                                            reset_index_name = True
+
+            actual_index_names = None
+            col_names = names
+            if index_col is not None and len(index_col) > 0:
+                if is_range_index:
+                    range_index_meta = index_col[0]
+                    range_idx = cudf.RangeIndex(
+                        start=range_index_meta["start"],
+                        stop=range_index_meta["stop"],
+                        step=range_index_meta["step"],
+                        name=range_index_meta["name"],
+                    )
+                    if skiprows != 0:
+                        range_idx = range_idx[skiprows:]
+                    if num_rows != -1:
+                        range_idx = range_idx[:num_rows]
+                else:
+                    actual_index_names = list(index_col_names.values())
+                    col_names = names[len(actual_index_names) :]
+
+            result_col_names = col_names if columns is None else names
+            if actual_index_names is None:
+                index = None
+                data = {
+                    name: Column.from_pylibcudf(col)
+                    for name, col in zip(
+                        result_col_names, tbl_w_meta.columns, strict=True
+                    )
+                }
+            else:
+                result_columns = [
+                    Column.from_pylibcudf(col) for col in tbl_w_meta.columns
+                ]
+                index = _index_from_data(
+                    dict(
+                        zip(
+                            actual_index_names,
+                            result_columns[: len(actual_index_names)],
+                            strict=True,
+                        )
+                    )
+                )
+                data = dict(
+                    zip(
+                        result_col_names,
+                        result_columns[len(actual_index_names) :],
+                        strict=True,
+                    )
+                )
+
+            if is_range_index:
+                index = range_idx
+            elif reset_index_name:
+                index.names = [None] * len(index.names)
+
+            child_name_values = tbl_w_meta.child_names.values()
+
+            data = {
+                name: ioutils._update_col_struct_field_names(col, child_names)
+                for (name, col), child_names in zip(
+                    data.items(), child_name_values
+                )
+            }
+
+        return cudf.DataFrame._from_data(data, index=index)
     else:
         from pyarrow import orc
 
-        def read_orc_stripe(orc_file, stripe, columns):
-            pa_table = orc_file.read_stripe(stripe, columns)
-            if isinstance(pa_table, pa.RecordBatch):
-                pa_table = pa.Table.from_batches([pa_table])
-            return pa_table
-
         warnings.warn("Using CPU via PyArrow to read ORC dataset.")
         if len(filepath_or_buffer) > 1:
             raise NotImplementedError(
@@ -364,11 +382,18 @@ def read_orc_stripe(orc_file, stripe, columns):
         orc_file = orc.ORCFile(filepath_or_buffer[0])
         if stripes is not None and len(stripes) > 0:
             for stripe_source_file in stripes:
-                pa_tables = [
-                    read_orc_stripe(orc_file, i, columns)
+                pa_tables = (
+                    orc_file.read_stripe(i, columns)
                     for i in stripe_source_file
-                ]
-                pa_table = pa.concat_tables(pa_tables)
+                )
+                pa_table = pa.concat_tables(
+                    [
+                        pa.Table.from_batches([table])
+                        if isinstance(table, pa.RecordBatch)
+                        else table
+                        for table in pa_tables
+                    ]
+                )
         else:
             pa_table = orc_file.read(columns=columns)
         df = cudf.DataFrame.from_arrow(pa_table)
@@ -378,16 +403,18 @@ def read_orc_stripe(orc_file, stripe, columns):
 
 @ioutils.doc_to_orc()
 def to_orc(
-    df,
+    df: cudf.DataFrame,
     fname,
-    compression="snappy",
-    statistics="ROWGROUP",
-    stripe_size_bytes=None,
-    stripe_size_rows=None,
-    row_index_stride=None,
+    compression: Literal[
+        False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4"
+    ] = "SNAPPY",
+    statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP",
+    stripe_size_bytes: int | None = None,
+    stripe_size_rows: int | None = None,
+    row_index_stride: int | None = None,
     cols_as_map_type=None,
     storage_options=None,
-    index=None,
+    index: bool | None = None,
 ):
     """{docstring}"""
 
@@ -413,7 +440,7 @@ def to_orc(
     if ioutils.is_fsspec_open_file(path_or_buf):
         with path_or_buf as file_obj:
             file_obj = ioutils.get_IOBase_writer(file_obj)
-            liborc.write_orc(
+            _plc_write_orc(
                 df,
                 file_obj,
                 compression,
@@ -425,7 +452,7 @@ def to_orc(
                 index,
             )
     else:
-        liborc.write_orc(
+        _plc_write_orc(
             df,
             path_or_buf,
             compression,
@@ -438,4 +465,279 @@ def to_orc(
         )
 
 
-ORCWriter = liborc.ORCWriter
+@acquire_spill_lock()
+def _plc_write_orc(
+    table: cudf.DataFrame,
+    path_or_buf,
+    compression: Literal[
+        False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4"
+    ] = "SNAPPY",
+    statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP",
+    stripe_size_bytes: int | None = None,
+    stripe_size_rows: int | None = None,
+    row_index_stride: int | None = None,
+    cols_as_map_type=None,
+    index: bool | None = None,
+) -> None:
+    """
+    See `cudf::io::write_orc`.
+
+    See Also
+    --------
+    cudf.read_orc
+    """
+    user_data = {"pandas": ioutils.generate_pandas_metadata(table, index)}
+    if index is True or (
+        index is None and not isinstance(table.index, cudf.RangeIndex)
+    ):
+        columns = (
+            table._columns
+            if table.index is None
+            else itertools.chain(table.index._columns, table._columns)
+        )
+        plc_table = plc.Table(
+            [col.to_pylibcudf(mode="read") for col in columns]
+        )
+        tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+        for level, idx_name in enumerate(table._index.names):
+            tbl_meta.column_metadata[level].set_name(
+                ioutils._index_level_name(idx_name, level, table._column_names)  # type: ignore[arg-type]
+            )
+        num_index_cols_meta = len(table.index.names)
+    else:
+        plc_table = plc.Table(
+            [col.to_pylibcudf(mode="read") for col in table._columns]
+        )
+        tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+        num_index_cols_meta = 0
+
+    has_map_type = False
+    if cols_as_map_type is not None:
+        cols_as_map_type = set(cols_as_map_type)
+        has_map_type = True
+
+    for i, (name, col) in enumerate(
+        table._column_labels_and_values, start=num_index_cols_meta
+    ):
+        tbl_meta.column_metadata[i].set_name(name)
+        _set_col_children_metadata(
+            col,
+            tbl_meta.column_metadata[i],
+            has_map_type and name in cols_as_map_type,
+        )
+
+    options = (
+        plc.io.orc.OrcWriterOptions.builder(
+            plc.io.SinkInfo([path_or_buf]), plc_table
+        )
+        .metadata(tbl_meta)
+        .key_value_metadata(user_data)
+        .compression(_get_comp_type(compression))
+        .enable_statistics(_get_orc_stat_freq(statistics))
+        .build()
+    )
+    if stripe_size_bytes is not None:
+        options.set_stripe_size_bytes(stripe_size_bytes)
+    if stripe_size_rows is not None:
+        options.set_stripe_size_rows(stripe_size_rows)
+    if row_index_stride is not None:
+        options.set_row_index_stride(row_index_stride)
+
+    plc.io.orc.write_orc(options)
+
+
+class ORCWriter:
+    """
+    ORCWriter lets you incrementally write out an ORC file from a series
+    of cudf tables
+
+    See Also
+    --------
+    cudf.io.orc.to_orc
+    """
+
+    def __init__(
+        self,
+        path,
+        index: bool | None = None,
+        compression: Literal[
+            False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4"
+        ] = "SNAPPY",
+        statistics: Literal["NONE", "STRIPE", "ROWGROUP"] = "ROWGROUP",
+        cols_as_map_type=None,
+        stripe_size_bytes: int | None = None,
+        stripe_size_rows: int | None = None,
+        row_index_stride: int | None = None,
+    ):
+        self.sink = plc.io.SinkInfo([path])
+        self.statistics = statistics
+        self.compression = compression
+        self.index = index
+        self.cols_as_map_type = (
+            cols_as_map_type
+            if cols_as_map_type is None
+            else set(cols_as_map_type)
+        )
+        self.stripe_size_bytes = stripe_size_bytes
+        self.stripe_size_rows = stripe_size_rows
+        self.row_index_stride = row_index_stride
+        self.initialized = False
+
+    def write_table(self, table):
+        """Writes a single table to the file"""
+        if not self.initialized:
+            self._initialize_chunked_state(table)
+
+        keep_index = self.index is not False and (
+            table.index.name is not None
+            or isinstance(table.index, cudf.MultiIndex)
+        )
+        if keep_index:
+            cols_to_write = itertools.chain(
+                table.index._columns, table._columns
+            )
+        else:
+            cols_to_write = table._columns
+
+        self.writer.write(
+            plc.Table([col.to_pylibcudf(mode="read") for col in cols_to_write])
+        )
+
+    def close(self):
+        if not self.initialized:
+            return
+        self.writer.close()
+
+    def _initialize_chunked_state(self, table):
+        """
+        Prepare all the values required to build the
+        chunked_orc_writer_options and create a writer
+        """
+
+        num_index_cols_meta = 0
+        plc_table = plc.Table(
+            [col.to_pylibcudf(mode="read") for col in table._columns]
+        )
+        self.tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+        if self.index is not False:
+            if isinstance(table.index, cudf.MultiIndex):
+                plc_table = plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read")
+                        for col in itertools.chain(
+                            table.index._columns, table._columns
+                        )
+                    ]
+                )
+                self.tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+                for level, idx_name in enumerate(table.index.names):
+                    self.tbl_meta.column_metadata[level].set_name(idx_name)
+                num_index_cols_meta = len(table.index.names)
+            else:
+                if table.index.name is not None:
+                    plc_table = plc.Table(
+                        [
+                            col.to_pylibcudf(mode="read")
+                            for col in itertools.chain(
+                                table.index._columns, table._columns
+                            )
+                        ]
+                    )
+                    self.tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+                    self.tbl_meta.column_metadata[0].set_name(table.index.name)
+                    num_index_cols_meta = 1
+
+        has_map_type = self.cols_as_map_type is not None
+        for i, (name, col) in enumerate(
+            table._column_labels_and_values, start=num_index_cols_meta
+        ):
+            self.tbl_meta.column_metadata[i].set_name(name)
+            _set_col_children_metadata(
+                col,
+                self.tbl_meta.column_metadata[i],
+                has_map_type and name in self.cols_as_map_type,
+            )
+
+        user_data = {
+            "pandas": ioutils.generate_pandas_metadata(table, self.index)
+        }
+
+        options = (
+            plc.io.orc.ChunkedOrcWriterOptions.builder(self.sink)
+            .metadata(self.tbl_meta)
+            .key_value_metadata(user_data)
+            .compression(_get_comp_type(self.compression))
+            .enable_statistics(_get_orc_stat_freq(self.statistics))
+            .build()
+        )
+        if self.stripe_size_bytes is not None:
+            options.set_stripe_size_bytes(self.stripe_size_bytes)
+        if self.stripe_size_rows is not None:
+            options.set_stripe_size_rows(self.stripe_size_rows)
+        if self.row_index_stride is not None:
+            options.set_row_index_stride(self.row_index_stride)
+
+        self.writer = plc.io.orc.OrcChunkedWriter.from_options(options)
+
+        self.initialized = True
+
+
+def _get_comp_type(
+    compression: Literal[False, None, "SNAPPY", "ZLIB", "ZSTD", "LZ4"],
+) -> plc.io.types.CompressionType:
+    if compression is None or compression is False:
+        return plc.io.types.CompressionType.NONE
+
+    normed_compression = compression.upper()
+    if normed_compression == "SNAPPY":
+        return plc.io.types.CompressionType.SNAPPY
+    elif normed_compression == "ZLIB":
+        return plc.io.types.CompressionType.ZLIB
+    elif normed_compression == "ZSTD":
+        return plc.io.types.CompressionType.ZSTD
+    elif normed_compression == "LZ4":
+        return plc.io.types.CompressionType.LZ4
+    else:
+        raise ValueError(f"Unsupported `compression` type {compression}")
+
+
+def _get_orc_stat_freq(
+    statistics: Literal["NONE", "STRIPE", "ROWGROUP"],
+) -> plc.io.types.StatisticsFreq:
+    """
+    Convert ORC statistics terms to CUDF convention:
+      - ORC "STRIPE"   == CUDF "ROWGROUP"
+      - ORC "ROWGROUP" == CUDF "PAGE"
+    """
+    normed_statistics = statistics.upper()
+    if normed_statistics == "NONE":
+        return plc.io.types.StatisticsFreq.STATISTICS_NONE
+    elif normed_statistics == "STRIPE":
+        return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP
+    elif normed_statistics == "ROWGROUP":
+        return plc.io.types.StatisticsFreq.STATISTICS_PAGE
+    else:
+        raise ValueError(f"Unsupported `statistics_freq` type {statistics}")
+
+
+def _set_col_children_metadata(
+    col: ColumnBase,
+    col_meta: plc.io.types.ColumnInMetadata,
+    list_column_as_map: bool = False,
+) -> None:
+    if isinstance(col.dtype, cudf.StructDtype):
+        for i, (child_col, name) in enumerate(
+            zip(col.children, list(col.dtype.fields))
+        ):
+            col_meta.child(i).set_name(name)
+            _set_col_children_metadata(
+                child_col, col_meta.child(i), list_column_as_map
+            )
+    elif isinstance(col.dtype, cudf.ListDtype):
+        if list_column_as_map:
+            col_meta.set_list_column_as_map()
+        _set_col_children_metadata(
+            col.children[1], col_meta.child(1), list_column_as_map
+        )
+    else:
+        return
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 2382e9f12ed..feb6e12da8c 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
 from __future__ import annotations
 
+import io
 import itertools
 import math
 import operator
@@ -10,23 +11,36 @@
 from collections import defaultdict
 from contextlib import ExitStack
 from functools import partial, reduce
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Literal
 from uuid import uuid4
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from pyarrow import dataset as ds
 
+import pylibcudf as plc
+
 import cudf
-from cudf._lib import parquet as libparquet
+from cudf._lib.column import Column
 from cudf.api.types import is_list_like
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import as_column, column_empty
 from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
 
+try:
+    import ujson as json  # type: ignore[import-untyped]
+except ImportError:
+    import json
+
 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from collections.abc import Callable, Hashable
+
+    from typing_extensions import Self
+
+    from cudf.core.column import ColumnBase
 
 
 BYTE_SIZES = {
@@ -55,31 +69,202 @@
 }
 
 
+@acquire_spill_lock()
+def _plc_write_parquet(
+    table,
+    filepaths_or_buffers,
+    index: bool | None = None,
+    compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy",
+    statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP",
+    metadata_file_path: str | None = None,
+    int96_timestamps: bool = False,
+    row_group_size_bytes: int | None = None,
+    row_group_size_rows: int | None = None,
+    max_page_size_bytes: int | None = None,
+    max_page_size_rows: int | None = None,
+    max_dictionary_size: int | None = None,
+    partitions_info=None,
+    force_nullable_schema: bool = False,
+    header_version: Literal["1.0", "2.0"] = "1.0",
+    use_dictionary: bool = True,
+    skip_compression: set[Hashable] | None = None,
+    column_encoding: dict[
+        Hashable,
+        Literal[
+            "PLAIN",
+            "DICTIONARY",
+            "DELTA_BINARY_PACKED",
+            "DELTA_LENGTH_BYTE_ARRAY",
+            "DELTA_BYTE_ARRAY",
+            "BYTE_STREAM_SPLIT",
+            "USE_DEFAULT",
+        ],
+    ]
+    | None = None,
+    column_type_length: dict | None = None,
+    output_as_binary: set[Hashable] | None = None,
+    write_arrow_schema: bool = False,
+) -> np.ndarray | None:
+    """
+    Call into the libcudf API through pylibcudf, see `write_parquet`.
+
+    See Also
+    --------
+    cudf.io.parquet.write_parquet
+    """
+    if index is True or (
+        index is None and not isinstance(table.index, cudf.RangeIndex)
+    ):
+        columns = itertools.chain(table.index._columns, table._columns)
+        plc_table = plc.Table(
+            [col.to_pylibcudf(mode="read") for col in columns]
+        )
+        tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+        for level, idx_name in enumerate(table.index.names):
+            tbl_meta.column_metadata[level].set_name(
+                ioutils._index_level_name(idx_name, level, table._column_names)
+            )
+        num_index_cols_meta = len(table.index.names)
+    else:
+        plc_table = plc.Table(
+            [col.to_pylibcudf(mode="read") for col in table._columns]
+        )
+        tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+        num_index_cols_meta = 0
+
+    for i, name in enumerate(table._column_names, num_index_cols_meta):
+        if not isinstance(name, str):
+            if cudf.get_option("mode.pandas_compatible"):
+                tbl_meta.column_metadata[i].set_name(str(name))
+            else:
+                raise ValueError(
+                    "Writing a Parquet file requires string column names"
+                )
+        else:
+            tbl_meta.column_metadata[i].set_name(name)
+
+        _set_col_metadata(
+            table[name]._column,
+            tbl_meta.column_metadata[i],
+            force_nullable_schema,
+            None,
+            skip_compression,
+            column_encoding,
+            column_type_length,
+            output_as_binary,
+        )
+    if partitions_info is not None:
+        user_data = [
+            {
+                "pandas": ioutils.generate_pandas_metadata(
+                    table.iloc[start_row : start_row + num_row].copy(
+                        deep=False
+                    ),
+                    index,
+                )
+            }
+            for start_row, num_row in partitions_info
+        ]
+    else:
+        user_data = [
+            {"pandas": ioutils.generate_pandas_metadata(table, index)}
+        ]
+
+    if header_version not in ("1.0", "2.0"):
+        raise ValueError(
+            f"Invalid parquet header version: {header_version}. "
+            "Valid values are '1.0' and '2.0'"
+        )
+
+    dict_policy = (
+        plc.io.types.DictionaryPolicy.ADAPTIVE
+        if use_dictionary
+        else plc.io.types.DictionaryPolicy.NEVER
+    )
+
+    comp_type = _get_comp_type(compression)
+    stat_freq = _get_stat_freq(statistics)
+    options = (
+        plc.io.parquet.ParquetWriterOptions.builder(
+            plc.io.SinkInfo(filepaths_or_buffers), plc_table
+        )
+        .metadata(tbl_meta)
+        .key_value_metadata(user_data)
+        .compression(comp_type)
+        .stats_level(stat_freq)
+        .int96_timestamps(int96_timestamps)
+        .write_v2_headers(header_version == "2.0")
+        .dictionary_policy(dict_policy)
+        .utc_timestamps(False)
+        .write_arrow_schema(write_arrow_schema)
+        .build()
+    )
+    if partitions_info is not None:
+        options.set_partitions(
+            [
+                plc.io.types.PartitionInfo(part[0], part[1])
+                for part in partitions_info
+            ]
+        )
+    if metadata_file_path is not None:
+        if is_list_like(metadata_file_path):
+            options.set_column_chunks_file_paths(metadata_file_path)
+        else:
+            options.set_column_chunks_file_paths([metadata_file_path])
+    if row_group_size_bytes is not None:
+        options.set_row_group_size_bytes(row_group_size_bytes)
+    if row_group_size_rows is not None:
+        options.set_row_group_size_rows(row_group_size_rows)
+    if max_page_size_bytes is not None:
+        options.set_max_page_size_bytes(max_page_size_bytes)
+    if max_page_size_rows is not None:
+        options.set_max_page_size_rows(max_page_size_rows)
+    if max_dictionary_size is not None:
+        options.set_max_dictionary_size(max_dictionary_size)
+    blob = plc.io.parquet.write_parquet(options)
+    if metadata_file_path is not None:
+        return np.asarray(blob.obj)
+    else:
+        return None
+
+
 @_performance_tracking
 def _write_parquet(
     df,
     paths,
-    compression="snappy",
-    index=None,
-    statistics="ROWGROUP",
-    metadata_file_path=None,
-    int96_timestamps=False,
-    row_group_size_bytes=None,
-    row_group_size_rows=None,
-    max_page_size_bytes=None,
-    max_page_size_rows=None,
-    max_dictionary_size=None,
+    compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy",
+    index: bool | None = None,
+    statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP",
+    metadata_file_path: str | None = None,
+    int96_timestamps: bool = False,
+    row_group_size_bytes: int | None = None,
+    row_group_size_rows: int | None = None,
+    max_page_size_bytes: int | None = None,
+    max_page_size_rows: int | None = None,
+    max_dictionary_size: int | None = None,
     partitions_info=None,
     storage_options=None,
-    force_nullable_schema=False,
-    header_version="1.0",
-    use_dictionary=True,
-    skip_compression=None,
-    column_encoding=None,
-    column_type_length=None,
-    output_as_binary=None,
-    write_arrow_schema=True,
-):
+    force_nullable_schema: bool = False,
+    header_version: Literal["1.0", "2.0"] = "1.0",
+    use_dictionary: bool = True,
+    skip_compression: set[Hashable] | None = None,
+    column_encoding: dict[
+        Hashable,
+        Literal[
+            "PLAIN",
+            "DICTIONARY",
+            "DELTA_BINARY_PACKED",
+            "DELTA_LENGTH_BYTE_ARRAY",
+            "DELTA_BYTE_ARRAY",
+            "BYTE_STREAM_SPLIT",
+            "USE_DEFAULT",
+        ],
+    ]
+    | None = None,
+    column_type_length: dict | None = None,
+    output_as_binary: set[Hashable] | None = None,
+    write_arrow_schema: bool = True,
+) -> np.ndarray | None:
     if is_list_like(paths) and len(paths) > 1:
         if partitions_info is None:
             ValueError("partition info is required for multiple paths")
@@ -124,11 +309,11 @@ def _write_parquet(
             file_objs = [
                 ioutils.get_IOBase_writer(file_obj) for file_obj in fsspec_objs
             ]
-            write_parquet_res = libparquet.write_parquet(
+            write_parquet_res = _plc_write_parquet(
                 df, filepaths_or_buffers=file_objs, **common_args
             )
     else:
-        write_parquet_res = libparquet.write_parquet(
+        write_parquet_res = _plc_write_parquet(
             df, filepaths_or_buffers=paths_or_bufs, **common_args
         )
 
@@ -141,26 +326,38 @@ def _write_parquet(
 def write_to_dataset(
     df,
     root_path,
-    compression="snappy",
+    compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy",
     filename=None,
     partition_cols=None,
     fs=None,
-    preserve_index=False,
-    return_metadata=False,
-    statistics="ROWGROUP",
-    int96_timestamps=False,
-    row_group_size_bytes=None,
-    row_group_size_rows=None,
-    max_page_size_bytes=None,
-    max_page_size_rows=None,
+    preserve_index: bool = False,
+    return_metadata: bool = False,
+    statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP",
+    int96_timestamps: bool = False,
+    row_group_size_bytes: int | None = None,
+    row_group_size_rows: int | None = None,
+    max_page_size_bytes: int | None = None,
+    max_page_size_rows: int | None = None,
     storage_options=None,
-    force_nullable_schema=False,
-    header_version="1.0",
-    use_dictionary=True,
-    skip_compression=None,
-    column_encoding=None,
-    column_type_length=None,
-    output_as_binary=None,
+    force_nullable_schema: bool = False,
+    header_version: Literal["1.0", "2.0"] = "1.0",
+    use_dictionary: bool = True,
+    skip_compression: set[Hashable] | None = None,
+    column_encoding: dict[
+        Hashable,
+        Literal[
+            "PLAIN",
+            "DICTIONARY",
+            "DELTA_BINARY_PACKED",
+            "DELTA_LENGTH_BYTE_ARRAY",
+            "DELTA_BYTE_ARRAY",
+            "BYTE_STREAM_SPLIT",
+            "USE_DEFAULT",
+        ],
+    ]
+    | None = None,
+    column_type_length: dict | None = None,
+    output_as_binary: set[Hashable] | None = None,
     store_schema=False,
 ):
     """Wraps `to_parquet` to write partitioned Parquet datasets.
@@ -330,9 +527,29 @@ def write_to_dataset(
     return metadata
 
 
+def _parse_metadata(meta) -> tuple[bool, Any, Any]:
+    file_is_range_index = False
+    file_index_cols = None
+    file_column_dtype = None
+
+    if "index_columns" in meta and len(meta["index_columns"]) > 0:
+        file_index_cols = meta["index_columns"]
+
+        if (
+            isinstance(file_index_cols[0], dict)
+            and file_index_cols[0]["kind"] == "range"
+        ):
+            file_is_range_index = True
+    if "column_indexes" in meta and len(meta["column_indexes"]) == 1:
+        file_column_dtype = meta["column_indexes"][0]["numpy_type"]
+    return file_is_range_index, file_index_cols, file_column_dtype
+
+
 @ioutils.doc_read_parquet_metadata()
 @_performance_tracking
-def read_parquet_metadata(filepath_or_buffer):
+def read_parquet_metadata(
+    filepath_or_buffer,
+) -> tuple[int, int, list[Hashable], int, list[dict[str, int]]]:
     """{docstring}"""
 
     # List of filepaths or buffers
@@ -341,7 +558,39 @@ def read_parquet_metadata(filepath_or_buffer):
         bytes_per_thread=None,
     )
 
-    return libparquet.read_parquet_metadata(filepaths_or_buffers)
+    parquet_metadata = plc.io.parquet_metadata.read_parquet_metadata(
+        plc.io.SourceInfo(filepaths_or_buffers)
+    )
+
+    # read all column names including index column, if any
+    col_names = [
+        info.name() for info in parquet_metadata.schema().root().children()
+    ]
+
+    index_col_names = set()
+    json_str = parquet_metadata.metadata()["pandas"]
+    if json_str != "":
+        meta = json.loads(json_str)
+        file_is_range_index, index_col, _ = _parse_metadata(meta)
+        if not file_is_range_index and index_col is not None:
+            columns = meta["columns"]
+            for idx_col in index_col:
+                for c in columns:
+                    if c["field_name"] == idx_col:
+                        index_col_names.add(idx_col)
+
+    # remove the index column from the list of column names
+    # only if index_col_names is not None
+    if len(index_col_names) >= 0:
+        col_names = [name for name in col_names if name not in index_col_names]
+
+    return (
+        parquet_metadata.num_rows(),
+        parquet_metadata.num_rowgroups(),
+        col_names,
+        len(col_names),
+        parquet_metadata.rowgroup_metadata(),
+    )
 
 
 @_performance_tracking
@@ -886,7 +1135,6 @@ def _parquet_to_frame(
                     dfs[-1][name] = column_empty(
                         row_count=_len,
                         dtype=_dtype,
-                        masked=True,
                     )
                 else:
                     dfs[-1][name] = as_column(
@@ -913,16 +1161,18 @@ def _read_parquet(
     columns=None,
     row_groups=None,
     use_pandas_metadata=None,
-    nrows=None,
-    skip_rows=None,
-    allow_mismatched_pq_schemas=False,
+    nrows: int | None = None,
+    skip_rows: int | None = None,
+    allow_mismatched_pq_schemas: bool = False,
     *args,
     **kwargs,
-):
+) -> cudf.DataFrame:
     # Simple helper function to dispatch between
     # cudf and pyarrow to read parquet data
     if engine == "cudf":
-        if kwargs:
+        if set(kwargs.keys()).difference(
+            set(("_chunk_read_limit", "_pass_read_limit"))
+        ):
             raise ValueError(
                 "cudf engine doesn't support the "
                 f"following keyword arguments: {list(kwargs.keys())}"
@@ -932,30 +1182,126 @@ def _read_parquet(
                 "cudf engine doesn't support the "
                 f"following positional arguments: {list(args)}"
             )
+        if nrows is None:
+            nrows = -1
+        if skip_rows is None:
+            skip_rows = 0
         if cudf.get_option("io.parquet.low_memory"):
-            return libparquet.read_parquet_chunked(
+            # Note: If this function ever accepts filters,
+            # allow_range_index needs to be False when a filter is passed
+            # (see read_parquet)
+            allow_range_index = columns is not None and len(columns) != 0
+
+            options = (
+                plc.io.parquet.ParquetReaderOptions.builder(
+                    plc.io.SourceInfo(filepaths_or_buffers)
+                )
+                .use_pandas_metadata(use_pandas_metadata)
+                .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas)
+                .build()
+            )
+            if row_groups is not None:
+                options.set_row_groups(row_groups)
+            if nrows > -1:
+                options.set_num_rows(nrows)
+            if skip_rows != 0:
+                options.set_skip_rows(skip_rows)
+            if columns is not None:
+                options.set_columns(columns)
+
+            reader = plc.io.parquet.ChunkedParquetReader(
+                options,
+                chunk_read_limit=kwargs.get("_chunk_read_limit", 0),
+                pass_read_limit=kwargs.get("_pass_read_limit", 1024000000),
+            )
+
+            tbl_w_meta = reader.read_chunk()
+            column_names = tbl_w_meta.column_names(include_children=False)
+            child_names = tbl_w_meta.child_names
+            per_file_user_data = tbl_w_meta.per_file_user_data
+            concatenated_columns = tbl_w_meta.tbl.columns()
+
+            # save memory
+            del tbl_w_meta
+
+            while reader.has_next():
+                tbl = reader.read_chunk().tbl
+
+                for i in range(tbl.num_columns()):
+                    concatenated_columns[i] = plc.concatenate.concatenate(
+                        [concatenated_columns[i], tbl._columns[i]]
+                    )
+                    # Drop residual columns to save memory
+                    tbl._columns[i] = None
+
+            data = {
+                name: Column.from_pylibcudf(col)
+                for name, col in zip(column_names, concatenated_columns)
+            }
+            df = cudf.DataFrame._from_data(data)
+            df = _process_metadata(
+                df,
+                column_names,
+                child_names,
+                per_file_user_data,
+                row_groups,
                 filepaths_or_buffers,
-                columns=columns,
-                row_groups=row_groups,
-                use_pandas_metadata=use_pandas_metadata,
-                nrows=nrows if nrows is not None else -1,
-                skip_rows=skip_rows if skip_rows is not None else 0,
-                allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
+                allow_range_index,
+                use_pandas_metadata,
+                nrows=nrows,
+                skip_rows=skip_rows,
             )
+            return df
         else:
-            if nrows is None:
-                nrows = -1
-            if skip_rows is None:
-                skip_rows = 0
-            return libparquet.read_parquet(
+            allow_range_index = True
+            filters = kwargs.get("filters", None)
+            if columns is not None and len(columns) == 0 or filters:
+                allow_range_index = False
+
+            options = (
+                plc.io.parquet.ParquetReaderOptions.builder(
+                    plc.io.SourceInfo(filepaths_or_buffers)
+                )
+                .use_pandas_metadata(use_pandas_metadata)
+                .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas)
+                .build()
+            )
+            if row_groups is not None:
+                options.set_row_groups(row_groups)
+            if nrows > -1:
+                options.set_num_rows(nrows)
+            if skip_rows != 0:
+                options.set_skip_rows(skip_rows)
+            if columns is not None:
+                options.set_columns(columns)
+            if filters is not None:
+                options.set_filter(filters)
+
+            tbl_w_meta = plc.io.parquet.read_parquet(options)
+            data = {
+                name: Column.from_pylibcudf(col)
+                for name, col in zip(
+                    tbl_w_meta.column_names(include_children=False),
+                    tbl_w_meta.columns,
+                    strict=True,
+                )
+            }
+
+            df = cudf.DataFrame._from_data(data)
+
+            df = _process_metadata(
+                df,
+                tbl_w_meta.column_names(include_children=False),
+                tbl_w_meta.child_names,
+                tbl_w_meta.per_file_user_data,
+                row_groups,
                 filepaths_or_buffers,
-                columns=columns,
-                row_groups=row_groups,
-                use_pandas_metadata=use_pandas_metadata,
+                allow_range_index,
+                use_pandas_metadata,
                 nrows=nrows,
                 skip_rows=skip_rows,
-                allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
             )
+            return df
     else:
         if (
             isinstance(filepaths_or_buffers, list)
@@ -980,28 +1326,40 @@ def to_parquet(
     df,
     path,
     engine="cudf",
-    compression="snappy",
-    index=None,
+    compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy",
+    index: bool | None = None,
     partition_cols=None,
     partition_file_name=None,
     partition_offsets=None,
-    statistics="ROWGROUP",
-    metadata_file_path=None,
-    int96_timestamps=False,
-    row_group_size_bytes=None,
-    row_group_size_rows=None,
-    max_page_size_bytes=None,
-    max_page_size_rows=None,
-    max_dictionary_size=None,
+    statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP",
+    metadata_file_path: str | None = None,
+    int96_timestamps: bool = False,
+    row_group_size_bytes: int | None = None,
+    row_group_size_rows: int | None = None,
+    max_page_size_bytes: int | None = None,
+    max_page_size_rows: int | None = None,
+    max_dictionary_size: int | None = None,
     storage_options=None,
-    return_metadata=False,
-    force_nullable_schema=False,
-    header_version="1.0",
-    use_dictionary=True,
-    skip_compression=None,
-    column_encoding=None,
-    column_type_length=None,
-    output_as_binary=None,
+    return_metadata: bool = False,
+    force_nullable_schema: bool = False,
+    header_version: Literal["1.0", "2.0"] = "1.0",
+    use_dictionary: bool = True,
+    skip_compression: set[Hashable] | None = None,
+    column_encoding: dict[
+        Hashable,
+        Literal[
+            "PLAIN",
+            "DICTIONARY",
+            "DELTA_BINARY_PACKED",
+            "DELTA_LENGTH_BYTE_ARRAY",
+            "DELTA_BYTE_ARRAY",
+            "BYTE_STREAM_SPLIT",
+            "USE_DEFAULT",
+        ],
+    ]
+    | None = None,
+    column_type_length: dict | None = None,
+    output_as_binary: set[Hashable] | None = None,
     store_schema=False,
     *args,
     **kwargs,
@@ -1114,10 +1472,11 @@ def to_parquet(
 
 
 @ioutils.doc_merge_parquet_filemetadata()
-def merge_parquet_filemetadata(filemetadata_list):
+def merge_parquet_filemetadata(filemetadata_list: list) -> np.ndarray:
     """{docstring}"""
-
-    return libparquet.merge_filemetadata(filemetadata_list)
+    return np.asarray(
+        plc.io.parquet.merge_row_group_metadata(filemetadata_list).obj
+    )
 
 
 def _generate_filename():
@@ -1205,10 +1564,207 @@ def _get_groups_and_offsets(
     return part_names, grouped_df, part_offsets
 
 
-ParquetWriter = libparquet.ParquetWriter
+class ParquetWriter:
+    """
+    ParquetWriter lets you incrementally write out a Parquet file from a series
+    of cudf tables
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, io.IOBase, os.PathLike, or list
+        File path or buffer to write to. The argument may also correspond
+        to a list of file paths or buffers.
+    index : bool or None, default None
+        If ``True``, include a dataframe's index(es) in the file output.
+        If ``False``, they will not be written to the file. If ``None``,
+        index(es) other than RangeIndex will be saved as columns.
+    compression : {'snappy', 'ZSTD', 'ZLIB', 'LZ4', None}, default 'snappy'
+        Name of the compression to use. Use ``None`` for no compression.
+    statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
+        Level at which column statistics should be included in file.
+    row_group_size_bytes: int, default ``uint64 max``
+        Maximum size of each row group of the output.
+        By default, a virtually infinite size equal to ``uint64 max`` will be used.
+    row_group_size_rows: int, default 1000000
+        Maximum number of rows (default 10^6) of each row group of the output.
+    max_page_size_bytes: int, default 524288
+        Maximum uncompressed size (default 512KB) of each page of the output.
+    max_page_size_rows: int, default 20000
+        Maximum number of rows of each page of the output.
+    max_dictionary_size: int, default 1048576
+        Maximum size of the dictionary page for each output column chunk. Dictionary
+        encoding for column chunks that exceeds this limit will be disabled.
+        By default, 1048576 (1MB) will be used.
+    use_dictionary : bool, default True
+        If ``True``, enable dictionary encoding for Parquet page data
+        subject to ``max_dictionary_size`` constraints.
+        If ``False``, disable dictionary encoding for Parquet page data.
+    store_schema : bool, default False
+        If ``True``, enable computing and writing arrow schema to Parquet
+        file footer's key-value metadata section for faithful round-tripping.
+
+    See Also
+    --------
+    cudf.io.parquet.write_parquet
+    """
+
+    def __init__(
+        self,
+        filepath_or_buffer,
+        index: bool | None = None,
+        compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy",
+        statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP",
+        row_group_size_bytes: int = int(np.iinfo(np.uint64).max),
+        row_group_size_rows: int = 1000000,
+        max_page_size_bytes: int = 524288,
+        max_page_size_rows: int = 20000,
+        max_dictionary_size: int = 1048576,
+        use_dictionary: bool = True,
+        store_schema: bool = False,
+    ):
+        filepaths_or_buffers = (
+            list(filepath_or_buffer)
+            if is_list_like(filepath_or_buffer)
+            else [filepath_or_buffer]
+        )
+        self.sink = plc.io.SinkInfo(filepaths_or_buffers)
+        self.statistics = statistics
+        self.compression = compression
+        self.index = index
+        self.initialized = False  # writer is built on first write_table
+        self.row_group_size_bytes = row_group_size_bytes
+        self.row_group_size_rows = row_group_size_rows
+        self.max_page_size_bytes = max_page_size_bytes
+        self.max_page_size_rows = max_page_size_rows
+        self.max_dictionary_size = max_dictionary_size
+        self.use_dictionary = use_dictionary
+        self.write_arrow_schema = store_schema
+
+    def write_table(self, table, partitions_info=None) -> None:
+        """Writes a single table to the file"""
+        if not self.initialized:
+            self._initialize_chunked_state(
+                table,
+                num_partitions=len(partitions_info) if partitions_info else 1,
+            )
+        # Unless index=False, a named index or MultiIndex is written first
+        if self.index is not False and (
+            table.index.name is not None
+            or isinstance(table.index, cudf.MultiIndex)
+        ):
+            columns = itertools.chain(table.index._columns, table._columns)
+            plc_table = plc.Table(
+                [col.to_pylibcudf(mode="read") for col in columns]
+            )
+        else:
+            plc_table = plc.Table(
+                [col.to_pylibcudf(mode="read") for col in table._columns]
+            )
+        self.writer.write(plc_table, partitions_info)
+
+    def close(self, metadata_file_path=None) -> np.ndarray | None:
+        """Close the writer, returning its blob if given a metadata path."""
+        if not self.initialized:
+            return None  # write_table was never called
+        column_chunks_file_paths = []
+        if metadata_file_path is not None:
+            if is_list_like(metadata_file_path):
+                column_chunks_file_paths = list(metadata_file_path)
+            else:
+                column_chunks_file_paths = [metadata_file_path]
+        blob = self.writer.close(column_chunks_file_paths)
+        if metadata_file_path is not None:
+            return np.asarray(blob.obj)
+        return None
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(self, *args) -> None:
+        self.close()  # no metadata path, so nothing is returned
+
+    def _initialize_chunked_state(
+        self, table, num_partitions: int = 1
+    ) -> None:
+        """Prepares all the values required to build the
+        chunked_parquet_writer_options and creates a writer
+        """
+
+        # Set the table_metadata
+        num_index_cols_meta = 0
+        plc_table = plc.Table(
+            [col.to_pylibcudf(mode="read") for col in table._columns]
+        )
+        self.tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+        if self.index is not False:
+            if isinstance(table.index, cudf.MultiIndex):
+                plc_table = plc.Table(
+                    [
+                        col.to_pylibcudf(mode="read")
+                        for col in itertools.chain(
+                            table.index._columns, table._columns
+                        )
+                    ]
+                )
+                self.tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+                for level, idx_name in enumerate(table.index.names):
+                    self.tbl_meta.column_metadata[level].set_name(idx_name)
+                num_index_cols_meta = len(table.index.names)
+            else:
+                if table.index.name is not None:
+                    plc_table = plc.Table(
+                        [
+                            col.to_pylibcudf(mode="read")
+                            for col in itertools.chain(
+                                table.index._columns, table._columns
+                            )
+                        ]
+                    )
+                    self.tbl_meta = plc.io.types.TableInputMetadata(plc_table)
+                    self.tbl_meta.column_metadata[0].set_name(table.index.name)
+                    num_index_cols_meta = 1
+
+        for i, name in enumerate(table._column_names, num_index_cols_meta):
+            self.tbl_meta.column_metadata[i].set_name(name)
+            _set_col_metadata(
+                table[name]._column,
+                self.tbl_meta.column_metadata[i],
+            )
+
+        # For a RangeIndex, index=False is passed to generate_pandas_metadata
+        index = (
+            False if isinstance(table.index, cudf.RangeIndex) else self.index
+        )
+        user_data = [
+            {"pandas": ioutils.generate_pandas_metadata(table, index)}
+        ] * num_partitions
+        comp_type = _get_comp_type(self.compression)
+        stat_freq = _get_stat_freq(self.statistics)
+        dict_policy = (
+            plc.io.types.DictionaryPolicy.ADAPTIVE
+            if self.use_dictionary
+            else plc.io.types.DictionaryPolicy.NEVER
+        )
+        options = (
+            plc.io.parquet.ChunkedParquetWriterOptions.builder(self.sink)
+            .metadata(self.tbl_meta)
+            .key_value_metadata(user_data)
+            .compression(comp_type)
+            .stats_level(stat_freq)
+            .row_group_size_bytes(self.row_group_size_bytes)
+            .row_group_size_rows(self.row_group_size_rows)
+            .max_page_size_bytes(self.max_page_size_bytes)
+            .max_page_size_rows(self.max_page_size_rows)
+            .max_dictionary_size(self.max_dictionary_size)
+            .write_arrow_schema(self.write_arrow_schema)
+            .build()
+        )
+        options.set_dictionary_policy(dict_policy)
+        self.writer = plc.io.parquet.ParquetChunkedWriter.from_options(options)
+        self.initialized = True
 
 
-def _parse_bytes(s):
+def _parse_bytes(s: str) -> int:
     """Parse byte string to numbers
 
     Utility function vendored from Dask.
@@ -1345,8 +1901,8 @@ def __init__(
         path,
         partition_cols,
         index=None,
-        compression="snappy",
-        statistics="ROWGROUP",
+        compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None] = "snappy",
+        statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"] = "ROWGROUP",
         max_file_size=None,
         file_name_prefix=None,
         storage_options=None,
@@ -1370,9 +1926,7 @@ def __init__(
         self.partition_cols = partition_cols
         # Collection of `ParquetWriter`s, and the corresponding
         # partition_col values they're responsible for
-        self._chunked_writers: list[
-            tuple[libparquet.ParquetWriter, list[str], str]
-        ] = []
+        self._chunked_writers: list[tuple[ParquetWriter, list[str], str]] = []
         # Map of partition_col values to their ParquetWriter's index
         # in self._chunked_writers for reverse lookup
         self.path_cw_map: dict[str, int] = {}
@@ -1563,3 +2117,257 @@ def _hive_dirname(name, val):
     if pd.isna(val):
         val = "__HIVE_DEFAULT_PARTITION__"
     return f"{name}={val}"
+
+
+def _set_col_metadata(
+    col: ColumnBase,
+    col_meta: plc.io.types.ColumnInMetadata,
+    force_nullable_schema: bool = False,
+    path: str | None = None,
+    skip_compression: set[Hashable] | None = None,
+    column_encoding: dict[
+        Hashable,
+        Literal[
+            "PLAIN",
+            "DICTIONARY",
+            "DELTA_BINARY_PACKED",
+            "DELTA_LENGTH_BYTE_ARRAY",
+            "DELTA_BYTE_ARRAY",
+            "BYTE_STREAM_SPLIT",
+            "USE_DEFAULT",
+        ],
+    ]
+    | None = None,
+    column_type_length: dict | None = None,
+    output_as_binary: set[Hashable] | None = None,
+) -> None:
+    """Apply per-column writer options to ``col_meta`` and its children.
+
+    ``path`` is the dotted name of the enclosing column, if any. The own
+    name is only fetched when a name-keyed option is given. Struct fields
+    and list elements recurse; unknown encodings raise ``ValueError``.
+    """
+    keyed_opts = (
+        skip_compression,
+        column_encoding,
+        column_type_length,
+        output_as_binary,
+    )
+    no_keys = all(opt is None for opt in keyed_opts)
+    own_name = None if no_keys else col_meta.get_name()
+    if path is None or own_name is None:
+        full_path = own_name
+    else:
+        full_path = path + "." + own_name
+
+    if force_nullable_schema:
+        col_meta.set_nullability(True)
+
+    if skip_compression is not None and full_path in skip_compression:
+        col_meta.set_skip_compression(True)
+
+    if column_encoding is not None and full_path in column_encoding:
+        requested = column_encoding[full_path]
+        if requested is None:
+            plc_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT
+        else:
+            plc_encoding = getattr(
+                plc.io.types.ColumnEncoding, str(requested).upper(), None
+            )
+            if plc_encoding is None:
+                raise ValueError("Unsupported `column_encoding` type")
+        col_meta.set_encoding(plc_encoding)
+
+    if column_type_length is not None and full_path in column_type_length:
+        col_meta.set_output_as_binary(True)
+        col_meta.set_type_length(column_type_length[full_path])
+
+    if output_as_binary is not None and full_path in output_as_binary:
+        col_meta.set_output_as_binary(True)
+
+    if isinstance(col.dtype, cudf.StructDtype):
+        fields = zip(col.children, col.dtype.fields)
+        for pos, (child_col, field_name) in enumerate(fields):
+            col_meta.child(pos).set_name(field_name)
+            _set_col_metadata(
+                child_col,
+                col_meta.child(pos),
+                force_nullable_schema,
+                full_path,
+                *keyed_opts,
+            )
+    elif isinstance(col.dtype, cudf.ListDtype):
+        if full_path is not None:
+            full_path = full_path + ".list"
+            col_meta.child(1).set_name("element")
+        _set_col_metadata(
+            col.children[1],
+            col_meta.child(1),
+            force_nullable_schema,
+            full_path,
+            *keyed_opts,
+        )
+    elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype):
+        col_meta.set_decimal_precision(col.dtype.precision)
+
+
+def _get_comp_type(
+    compression: Literal["snappy", "ZSTD", "ZLIB", "LZ4", None],
+) -> plc.io.types.CompressionType:
+    """Map ``compression`` to a CompressionType; ValueError if unknown."""
+    member_name = "NONE" if compression is None else compression.upper()
+    comp_type = getattr(plc.io.types.CompressionType, member_name, None)
+    if comp_type is None:
+        raise ValueError("Unsupported `compression` type")
+    return comp_type
+
+
+def _get_stat_freq(
+    statistics: Literal["ROWGROUP", "PAGE", "COLUMN", "NONE"],
+) -> plc.io.types.StatisticsFreq:
+    """Map ``statistics`` to a StatisticsFreq; ValueError if unknown."""
+    member_name = "STATISTICS_" + statistics.upper()
+    stat_freq = getattr(plc.io.types.StatisticsFreq, member_name, None)
+    if stat_freq is None:
+        raise ValueError("Unsupported `statistics_freq` type")
+    return stat_freq
+
+
+def _process_metadata(
+    df: cudf.DataFrame,
+    names: list[Hashable],
+    child_names: dict,
+    per_file_user_data: list,
+    row_groups,
+    filepaths_or_buffers,
+    allow_range_index: bool,
+    use_pandas_metadata: bool,
+    nrows: int = -1,
+    skip_rows: int = 0,
+) -> cudf.DataFrame:
+    """Restore ``df``'s index and decimal precisions from pandas metadata."""
+    ioutils._add_df_col_struct_names(df, child_names)
+    index_col = None
+    is_range_index = True
+    column_index_type = None
+    index_col_names = None
+    meta = None
+    for single_file in per_file_user_data:
+        if b"pandas" not in single_file:
+            continue
+        json_str = single_file[b"pandas"].decode("utf-8")
+        meta = json.loads(json_str)
+        file_is_range_index, index_col, column_index_type = _parse_metadata(
+            meta
+        )
+        is_range_index &= file_is_range_index
+        # Record field_name -> name for the index columns (first such file)
+        if (
+            not file_is_range_index
+            and index_col is not None
+            and index_col_names is None
+        ):
+            index_col_names = {}
+            for idx_col in index_col:
+                for c in meta["columns"]:
+                    if c["field_name"] == idx_col:
+                        index_col_names[idx_col] = c["name"]
+
+    if meta is not None:
+        # Index the column metadata by name: `meta["columns"]` is not
+        # guaranteed to be in the same order as `names`.
+        meta_data_per_column = {
+            col_meta["name"]: col_meta for col_meta in meta["columns"]
+        }
+
+        # update the decimal precision of each column
+        for col in names:
+            if isinstance(df._data[col].dtype, cudf.core.dtypes.DecimalDtype):
+                df._data[col].dtype.precision = meta_data_per_column[col][
+                    "metadata"
+                ]["precision"]
+
+    # Set the index column
+    if index_col is not None and len(index_col) > 0:
+        if is_range_index:
+            if not allow_range_index:
+                return df
+            # Several files: use a fresh 0..len(df) range, not stored metadata
+            if len(per_file_user_data) > 1:
+                range_index_meta = {
+                    "kind": "range",
+                    "name": None,
+                    "start": 0,
+                    "stop": len(df),
+                    "step": 1,
+                }
+            else:
+                range_index_meta = index_col[0]
+            # Row groups selected: build the index from each group's row span
+            if row_groups is not None:
+                per_file_metadata = [
+                    pa.parquet.read_metadata(
+                        # Pyarrow cannot read directly from bytes
+                        io.BytesIO(s) if isinstance(s, bytes) else s
+                    )
+                    for s in filepaths_or_buffers
+                ]
+
+                filtered_idx = []
+                for i, file_meta in enumerate(per_file_metadata):
+                    row_groups_i = []
+                    start = 0
+                    for row_group in range(file_meta.num_row_groups):
+                        stop = start + file_meta.row_group(row_group).num_rows
+                        row_groups_i.append((start, stop))
+                        start = stop
+
+                    for rg in row_groups[i]:
+                        filtered_idx.append(
+                            cudf.RangeIndex(
+                                start=row_groups_i[rg][0],
+                                stop=row_groups_i[rg][1],
+                                step=range_index_meta["step"],
+                            )
+                        )
+
+                if len(filtered_idx) > 0:
+                    idx = cudf.concat(filtered_idx)
+                else:
+                    idx = cudf.Index._from_column(
+                        cudf.core.column.column_empty(0)
+                    )
+            else:
+                start = range_index_meta["start"] + skip_rows  # type: ignore[operator]
+                stop = range_index_meta["stop"]
+                if nrows > -1:
+                    stop = start + nrows
+                idx = cudf.RangeIndex(
+                    start=start,
+                    stop=stop,
+                    step=range_index_meta["step"],
+                    name=range_index_meta["name"],
+                )
+
+            df.index = idx
+        elif set(index_col).issubset(names):
+            index_data = df[index_col]
+            actual_index_names = iter(index_col_names.values())
+            if index_data._num_columns == 1:
+                idx = cudf.Index._from_column(
+                    index_data._columns[0], name=next(actual_index_names)
+                )
+            else:
+                idx = cudf.MultiIndex.from_frame(
+                    index_data, names=list(actual_index_names)
+                )
+            df.drop(columns=index_col, inplace=True)
+            df.index = idx
+        else:
+            if use_pandas_metadata:
+                df.index.names = index_col
+
+    if df._num_columns == 0 and column_index_type is not None:
+        df._data.label_dtype = cudf.dtype(column_index_type)
+
+    return df
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 5ce738cae0e..5e266c5ff55 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
-from io import BytesIO, StringIO
+from io import BytesIO, StringIO, TextIOBase
+
+import pylibcudf as plc
 
 import cudf
-from cudf._lib import text as libtext
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
 
@@ -33,13 +34,35 @@ def read_text(
         filepath_or_buffer, "read_text"
     )
 
-    return cudf.Series._from_column(
-        libtext.read_text(
-            filepath_or_buffer,
-            delimiter=delimiter,
-            byte_range=byte_range,
-            strip_delimiters=strip_delimiters,
-            compression=compression,
-            compression_offsets=compression_offsets,
-        )
+    if compression is None:
+        if isinstance(filepath_or_buffer, TextIOBase):
+            datasource = plc.io.text.make_source(filepath_or_buffer.read())
+        else:
+            datasource = plc.io.text.make_source_from_file(filepath_or_buffer)
+    elif compression == "bgzip":
+        if isinstance(filepath_or_buffer, TextIOBase):
+            raise ValueError("bgzip compression requires a file path")
+        if compression_offsets is not None:
+            if len(compression_offsets) != 2:
+                raise ValueError(
+                    "Compression offsets need to consist of two elements"
+                )
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepath_or_buffer,
+                compression_offsets[0],
+                compression_offsets[1],
+            )
+        else:
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepath_or_buffer,
+            )
+    else:
+        raise ValueError("Only bgzip compression is supported at the moment")
+
+    options = plc.io.text.ParseOptions(
+        byte_range=byte_range, strip_delimiters=strip_delimiters
     )
+    plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
+    result = cudf._lib.column.Column.from_pylibcudf(plc_column)
+
+    return cudf.Series._from_column(result)
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index 0b09cf7dc34..a1df2c7d857 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -692,8 +692,8 @@ def assert_frame_equal(
     )
 
     pd.testing.assert_index_equal(
-        left._data.to_pandas_index(),
-        right._data.to_pandas_index(),
+        left._data.to_pandas_index,
+        right._data.to_pandas_index,
         exact=check_column_type,
         check_names=check_names,
         check_exact=check_exact,
diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc
new file mode 100644
index 00000000000..a0ea4fbbfc2
Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc differ
diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc
new file mode 100644
index 00000000000..8a7969cdbbb
Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc differ
diff --git a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl
index 1ec077d10f7..64e06f0631d 100644
Binary files a/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl and b/python/cudf/cudf/tests/data/pkl/stringColumnWithRangeIndex_cudf_23.12.pkl differ
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index db24fdd2a29..8e1dba858c3 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -950,3 +950,13 @@ def test_index_set_categories(ordered):
     expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered)
     result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered)
     assert_eq(result, expected)
+
+
+def test_categorical_interval_pandas_roundtrip():
+    # Interval categoricals must survive cudf -> pandas -> cudf ...
+    gsr = cudf.Series(cudf.interval_range(0, 5)).astype("category")
+    assert_eq(cudf.Series.from_pandas(gsr.to_pandas()), gsr)
+
+    # ... and pandas -> cudf -> pandas.
+    psr = pd.Series(pd.interval_range(0, 5)).astype("category")
+    assert_eq(cudf.Series.from_pandas(psr).to_pandas(), psr)
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 65947efc2df..c3c9a1c5338 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -7,7 +7,6 @@
 import pytest
 
 import cudf
-from cudf._lib.transform import mask_to_bools
 from cudf.core.column.column import as_column
 from cudf.testing import assert_eq
 from cudf.testing._utils import assert_exceptions_equal
@@ -489,9 +488,7 @@ def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
 
     # check mask
     expect_mask = [x is not pd.NA for x in pd_data["a"]]
-    got_mask = mask_to_bools(
-        gd_data["a"]._column.base_mask, 0, len(gd_data)
-    ).values_host
+    got_mask = gd_data["a"]._column._get_mask_as_column().values_host
 
     np.testing.assert_array_equal(expect_mask, got_mask)
 
@@ -527,9 +524,7 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
 
     # check mask
     expect_mask = [x is not pd.NA for x in pd_data]
-    got_mask = mask_to_bools(
-        gd_data._column.base_mask, 0, len(gd_data)
-    ).values_host
+    got_mask = gd_data._column._get_mask_as_column().values_host
 
     np.testing.assert_array_equal(expect_mask, got_mask)
 
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
index 5cef077c18d..27ec4fcd1f3 100644
--- a/python/cudf/cudf/tests/test_column_accessor.py
+++ b/python/cudf/cudf/tests/test_column_accessor.py
@@ -64,7 +64,7 @@ def test_to_pandas_simple(simple_data):
     # Index([], dtype='object'), and `integer` for RangeIndex()
     # to ignore this `inferred_type` comparison, we pass exact=False.
     assert_eq(
-        ca.to_pandas_index(),
+        ca.to_pandas_index,
         pd.DataFrame(
             {key: value.values_host for key, value in simple_data.items()}
         ).columns,
@@ -75,7 +75,7 @@ def test_to_pandas_simple(simple_data):
 def test_to_pandas_multiindex(mi_data):
     ca = ColumnAccessor(mi_data, multiindex=True)
     assert_eq(
-        ca.to_pandas_index(),
+        ca.to_pandas_index,
         pd.DataFrame(
             {key: value.values_host for key, value in mi_data.items()}
         ).columns,
@@ -89,7 +89,7 @@ def test_to_pandas_multiindex_names():
         level_names=("foo", "bar"),
     )
     assert_eq(
-        ca.to_pandas_index(),
+        ca.to_pandas_index,
         pd.MultiIndex.from_tuples(
             (("a", "b"), ("c", "d")), names=("foo", "bar")
         ),
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d04fd97dcbd..f3cf8e36a5b 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
 import array as arr
 import contextlib
@@ -1440,6 +1440,7 @@ def test_assign_callable(mapping):
         "sha256",
         "sha384",
         "sha512",
+        "xxhash32",
         "xxhash64",
     ],
 )
@@ -1447,6 +1448,7 @@ def test_assign_callable(mapping):
 def test_dataframe_hash_values(nrows, method, seed):
     warning_expected = seed is not None and method not in {
         "murmur3",
+        "xxhash32",
         "xxhash64",
     }
     potential_warning = (
@@ -1472,6 +1474,7 @@ def test_dataframe_hash_values(nrows, method, seed):
         "sha256": object,
         "sha384": object,
         "sha512": object,
+        "xxhash32": np.uint32,
         "xxhash64": np.uint64,
     }
     assert out.dtype == expected_dtypes[method]
@@ -1486,7 +1489,7 @@ def test_dataframe_hash_values(nrows, method, seed):
         assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one)
 
 
-@pytest.mark.parametrize("method", ["murmur3", "xxhash64"])
+@pytest.mark.parametrize("method", ["murmur3", "xxhash32", "xxhash64"])
 def test_dataframe_hash_values_seed(method):
     gdf = cudf.DataFrame()
     data = np.arange(10)
@@ -1500,6 +1503,34 @@ def test_dataframe_hash_values_seed(method):
     assert_neq(out_one, out_two)
 
 
+def test_dataframe_hash_values_xxhash32():
+    # No built-in xxhash32 in Python and no test-only dependency wanted, so
+    # this is a regression test against known good values.
+    frame = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]})
+    frame["b"] = -frame["a"]
+
+    # Column "a", seed 0.
+    want = cudf.Series(
+        [3736311059, 2307980487, 2906647130, 746578903, 4294967295],
+        dtype=np.uint32,
+    )
+    assert_eq(frame["a"].hash_values(method="xxhash32", seed=0), want)
+
+    # Column "b" (the negation of "a"), seed 42.
+    want = cudf.Series(
+        [1076387279, 2261349915, 531498073, 650869264, 4294967295],
+        dtype=np.uint32,
+    )
+    assert_eq(frame["b"].hash_values(method="xxhash32", seed=42), want)
+
+    # Whole frame (both columns), seed 0.
+    want = cudf.Series(
+        [1223721700, 2885793241, 1920811472, 1146715602, 4294967295],
+        dtype=np.uint32,
+    )
+    assert_eq(frame.hash_values(method="xxhash32", seed=0), want)
+
+
 def test_dataframe_hash_values_xxhash64():
     # xxhash64 has no built-in implementation in Python and we don't want to
     # add a testing dependency, so we use regression tests against known good
@@ -11193,3 +11224,32 @@ def test_dataframe_init_column():
     expect = cudf.DataFrame({"a": s})
     actual = cudf.DataFrame._from_arrays(s._column, columns=["a"])
     assert_eq(expect, actual)
+
+
+@pytest.mark.parametrize("name", [None, "foo", 1, 1.0])
+def test_dataframe_column_name(name):
+    gdf = cudf.DataFrame({"a": [1, 2, 3]})
+    pdf = gdf.to_pandas()
+
+    # Give the columns Index the same name on the cudf and pandas frames.
+    gdf.columns.name = pdf.columns.name = name
+
+    assert_eq(gdf, pdf)
+    assert_eq(gdf.columns.name, pdf.columns.name)
+
+
+@pytest.mark.parametrize("names", [["abc", "def"], [1, 2], ["abc", 10]])
+def test_dataframe_multiindex_column_names(names):
+    levels = [["A", "A", "B", "B"], ["one", "two", "one", "two"]]
+    columns = pd.MultiIndex.from_tuples(
+        list(zip(*levels)), names=["first", "second"]
+    )
+    pdf = pd.DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=columns)
+    gdf = cudf.from_pandas(pdf)
+
+    # Names inherited from pandas must match, and so must reassigned names.
+    assert_eq(gdf, pdf)
+    assert_eq(gdf.columns.names, pdf.columns.names)
+    pdf.columns.names = gdf.columns.names = names
+    assert_eq(gdf, pdf)
+    assert_eq(gdf.columns.names, pdf.columns.names)
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index d8a2528230e..74593aa841f 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -2472,11 +2472,8 @@ def test_groupby_unique(by, data, dtype):
 def test_groupby_2keys_scan(nelem, func):
     pdf = make_frame(pd.DataFrame, nelem=nelem)
     expect_df = pdf.groupby(["x", "y"], sort=True).agg(func)
-    got_df = (
-        make_frame(DataFrame, nelem=nelem)
-        .groupby(["x", "y"], sort=True)
-        .agg(func)
-    )
+    gdf = cudf.from_pandas(pdf)
+    got_df = gdf.groupby(["x", "y"], sort=True).agg(func)
     # pd.groupby.cumcount returns a series.
     if isinstance(expect_df, pd.Series):
         expect_df = expect_df.to_frame("val")
@@ -2484,6 +2481,18 @@ def test_groupby_2keys_scan(nelem, func):
     check_dtype = func not in _index_type_aggs
     assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype)
 
+    expect_df = getattr(pdf.groupby(["x", "y"], sort=True), func)()
+    got_df = getattr(gdf.groupby(["x", "y"], sort=True), func)()
+    assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype)
+
+    expect_df = getattr(pdf.groupby(["x", "y"], sort=True)[["x"]], func)()
+    got_df = getattr(gdf.groupby(["x", "y"], sort=True)[["x"]], func)()
+    assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype)
+
+    expect_df = getattr(pdf.groupby(["x", "y"], sort=True)["y"], func)()
+    got_df = getattr(gdf.groupby(["x", "y"], sort=True)["y"], func)()
+    assert_groupby_results_equal(got_df, expect_df, check_dtype=check_dtype)
+
 
 @pytest.mark.parametrize("nelem", [100, 1000])
 @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
@@ -3960,8 +3969,8 @@ def test_group_by_value_counts_with_count_column():
 def test_groupby_internal_groups_empty(gdf):
     # test that we don't segfault when calling the internal
     # .groups() method with an empty list:
-    gb = gdf.groupby("y")._groupby
-    _, _, grouped_vals = gb.groups([])
+    gb = gdf.groupby("y")
+    _, _, grouped_vals = gb._groups([])
     assert grouped_vals == []
 
 
@@ -4076,6 +4085,13 @@ def test_get_group_list_like():
         df.groupby(["a"]).get_group([1])
 
 
+def test_get_group_list_like_len_2():
+    gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [3, 2, 1]})
+    keys, group = ["a", "b"], (1, 4)  # two grouping keys, 2-tuple group name
+    expected = gdf.to_pandas().groupby(keys).get_group(group)
+    assert_eq(gdf.groupby(keys).get_group(group), expected)
+
+
 def test_size_as_index_false():
     df = pd.DataFrame({"a": [1, 2, 1], "b": [1, 2, 3]}, columns=["a", "b"])
     expected = df.groupby("a", as_index=False).size()
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index aaa8d7d07ee..db34329261f 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -1453,3 +1453,12 @@ def test_chunked_json_reader():
     with cudf.option_context("io.json.low_memory", True):
         gdf = cudf.read_json(buf, lines=True)
     assert_eq(df, gdf)
+
+
+@pytest.mark.parametrize("compression", ["gzip", None])
+def test_roundtrip_compression(compression, tmp_path):
+    source = cudf.DataFrame({"a": 1, "b": "2"})
+    buffer = BytesIO()  # in-memory sink, read back with the same setting
+    source.to_json(buffer, engine="cudf", compression=compression)
+    got = cudf.read_json(buffer, engine="cudf", compression=compression)
+    assert_eq(got, source)
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 260b481b933..b1f81edfc54 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 import functools
 import operator
@@ -10,11 +10,11 @@
 
 import cudf
 from cudf import NA
-from cudf._lib.copying import get_element
 from cudf.api.types import is_scalar
 from cudf.core.column.column import column_empty
 from cudf.testing import assert_eq
 from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
+from cudf.utils.dtypes import cudf_dtype_to_pa_type
 
 
 @pytest.mark.parametrize(
@@ -424,7 +424,9 @@ def test_get_ind_sequence():
 def test_contains_scalar(data, scalar, expect):
     sr = cudf.Series(data)
     expect = cudf.Series(expect)
-    got = sr.list.contains(cudf.Scalar(scalar, sr.dtype.element_type))
+    got = sr.list.contains(
+        pa.scalar(scalar, type=cudf_dtype_to_pa_type(sr.dtype.element_type))
+    )
     assert_eq(expect, got)
 
 
@@ -456,7 +458,9 @@ def test_contains_scalar(data, scalar, expect):
 def test_contains_null_search_key(data, expect):
     sr = cudf.Series(data)
     expect = cudf.Series(expect, dtype="bool")
-    got = sr.list.contains(cudf.Scalar(cudf.NA, sr.dtype.element_type))
+    got = sr.list.contains(
+        pa.scalar(None, type=cudf_dtype_to_pa_type(sr.dtype.element_type))
+    )
     assert_eq(expect, got)
 
 
@@ -519,12 +523,12 @@ def test_contains_invalid(data, scalar):
         ),
         (
             [["d", None, "e"], [None, "f"], []],
-            cudf.Scalar(cudf.NA, "O"),
+            pa.scalar(None, type=pa.string()),
             [None, None, None],
         ),
         (
             [None, [10, 9, 8], [5, 8, None]],
-            cudf.Scalar(cudf.NA, "int64"),
+            pa.scalar(None, type=pa.int64()),
             [None, None, None],
         ),
     ],
@@ -533,7 +537,11 @@ def test_index(data, search_key, expect):
     sr = cudf.Series(data)
     expect = cudf.Series(expect, dtype="int32")
     if is_scalar(search_key):
-        got = sr.list.index(cudf.Scalar(search_key, sr.dtype.element_type))
+        got = sr.list.index(
+            pa.scalar(
+                search_key, type=cudf_dtype_to_pa_type(sr.dtype.element_type)
+            )
+        )
     else:
         got = sr.list.index(
             cudf.Series(search_key, dtype=sr.dtype.element_type)
@@ -715,9 +723,8 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level):
     ],
 )
 def test_list_scalar_device_construction(data):
-    col = cudf.Series([data])._column
-    slr = get_element(col, 0)
-    assert slr.value == data
+    res = cudf.Series([data])._column.element_indexing(0)
+    assert res == data
 
 
 @pytest.mark.parametrize("nesting_level", [1, 2, 3])
@@ -729,10 +736,8 @@ def test_list_scalar_device_construction_null(nesting_level):
     arrow_type = pa.infer_type(data)
     arrow_arr = pa.array([None], type=arrow_type)
 
-    col = cudf.Series(arrow_arr)._column
-    slr = get_element(col, 0)
-
-    assert slr.value is cudf.NA
+    res = cudf.Series(arrow_arr)._column.element_indexing(0)
+    assert res is cudf.NA
 
 
 @pytest.mark.parametrize("input_obj", [[[1, NA, 3]], [[1, NA, 3], [4, 5, NA]]])
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index c4b4ef60184..fe143e66407 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import datetime
 import decimal
@@ -1970,3 +1970,25 @@ def test_row_group_alignment(datadir):
     got = cudf.read_orc(buffer)
 
     assert_eq(expected, got)
+
+
+@pytest.mark.parametrize(
+    "inputfile",
+    [
+        "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc",
+        "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc",
+    ],
+)
+def test_orc_reader_desynced_timestamp(datadir, inputfile):
+    # Test a special case where the DATA stream (seconds) of a TIMESTAMP
+    # column has advanced further than the SECONDARY stream (nanoseconds) at
+    # the start of a row group. In this case, the "run cache manager" in the
+    # decoder kernel is used to orchestrate the dual-stream processing.
+    # For more information, see https://github.com/rapidsai/cudf/issues/17155.
+
+    path = datadir / inputfile
+
+    expect = pd.read_orc(path)
+    got = cudf.read_orc(path)
+
+    assert_frame_equal(cudf.from_pandas(expect), got)
diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py
deleted file mode 100644
index b474bbe9bd8..00000000000
--- a/python/cudf/cudf/tests/test_pack.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pickle
-import sys
-
-import numpy as np
-import pandas as pd
-
-from cudf import DataFrame, Index, Series
-from cudf._lib.copying import pack, unpack
-from cudf.testing import assert_eq
-
-
-def test_sizeof_packed_dataframe():
-    rng = np.random.default_rng(seed=0)
-    df = DataFrame()
-    nelem = 1000
-    df["keys"] = hkeys = np.arange(nelem, dtype=np.float64)
-    df["vals"] = hvals = rng.random(nelem)
-    packed = pack(df)
-
-    nbytes = hkeys.nbytes + hvals.nbytes
-    sizeof = sys.getsizeof(packed)
-    assert sizeof < nbytes
-
-    serialized_nbytes = len(
-        pickle.dumps(packed, protocol=pickle.HIGHEST_PROTOCOL)
-    )
-
-    # assert at least sizeof bytes were serialized
-    assert serialized_nbytes >= sizeof
-
-
-def check_packed_equality(df):
-    # basic
-    assert_packed_frame_equality(df)
-    # sliced
-    assert_packed_frame_equality(df[:-1])
-    assert_packed_frame_equality(df[1:])
-    assert_packed_frame_equality(df[2:-2])
-    # sorted
-    sortvaldf = df.sort_values("vals")
-    assert isinstance(sortvaldf.index, Index)
-    assert_packed_frame_equality(sortvaldf)
-
-
-def assert_packed_frame_equality(df):
-    pdf = df.to_pandas()
-
-    packed = pack(df)
-    del df
-    unpacked = unpack(packed)
-
-    assert_eq(unpacked, pdf)
-
-
-def test_packed_dataframe_equality_numeric():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    nelem = 10
-    df["keys"] = np.arange(nelem, dtype=np.float64)
-    df["vals"] = rng.random(nelem)
-
-    check_packed_equality(df)
-
-
-def test_packed_dataframe_equality_categorical():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = pd.Categorical(
-        ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_equality(df)
-
-
-def test_packed_dataframe_equality_list():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10)))
-    df["vals"] = rng.random(len(df))
-
-    check_packed_equality(df)
-
-
-def test_packed_dataframe_equality_struct():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(
-        list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_equality(df)
-
-
-def check_packed_unique_pointers(df):
-    # basic
-    assert_packed_frame_unique_pointers(df)
-    # sliced
-    assert_packed_frame_unique_pointers(df[:-1])
-    assert_packed_frame_unique_pointers(df[1:])
-    assert_packed_frame_unique_pointers(df[2:-2])
-    # sorted
-    sortvaldf = df.sort_values("vals")
-    assert isinstance(sortvaldf.index, Index)
-    assert_packed_frame_unique_pointers(sortvaldf)
-
-
-def assert_packed_frame_unique_pointers(df):
-    unpacked = unpack(pack(df))
-
-    for col in df:
-        if df._data[col].data:
-            assert df._data[col].data.get_ptr(mode="read") != unpacked._data[
-                col
-            ].data.get_ptr(mode="read")
-
-
-def test_packed_dataframe_unique_pointers_numeric():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    nelem = 10
-    df["keys"] = np.arange(nelem, dtype=np.float64)
-    df["vals"] = rng.random(nelem)
-
-    check_packed_unique_pointers(df)
-
-
-def test_packed_dataframe_unique_pointers_categorical():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = pd.Categorical(
-        ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_unique_pointers(df)
-
-
-def test_packed_dataframe_unique_pointers_list():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10)))
-    df["vals"] = rng.random(len(df))
-
-    check_packed_unique_pointers(df)
-
-
-def test_packed_dataframe_unique_pointers_struct():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(
-        list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_unique_pointers(df)
-
-
-def check_packed_pickled_equality(df):
-    # basic
-    assert_packed_frame_picklable(df)
-    # sliced
-    assert_packed_frame_picklable(df[:-1])
-    assert_packed_frame_picklable(df[1:])
-    assert_packed_frame_picklable(df[2:-2])
-    # sorted
-    sortvaldf = df.sort_values("vals")
-    assert isinstance(sortvaldf.index, Index)
-    assert_packed_frame_picklable(sortvaldf)
-    # out-of-band
-    buffers = []
-    serialbytes = pickle.dumps(
-        pack(df), protocol=5, buffer_callback=buffers.append
-    )
-    for b in buffers:
-        assert isinstance(b, pickle.PickleBuffer)
-    loaded = unpack(pickle.loads(serialbytes, buffers=buffers))
-    assert_eq(loaded, df)
-
-
-def assert_packed_frame_picklable(df):
-    serialbytes = pickle.dumps(pack(df))
-    loaded = unpack(pickle.loads(serialbytes))
-    assert_eq(loaded, df)
-
-
-def test_pickle_packed_dataframe_numeric():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    nelem = 10
-    df["keys"] = np.arange(nelem, dtype=np.float64)
-    df["vals"] = rng.random(nelem)
-
-    check_packed_pickled_equality(df)
-
-
-def test_pickle_packed_dataframe_categorical():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = pd.Categorical(
-        ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_pickled_equality(df)
-
-
-def test_pickle_packed_dataframe_list():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10)))
-    df["vals"] = rng.random(len(df))
-
-    check_packed_pickled_equality(df)
-
-
-def test_pickle_packed_dataframe_struct():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(
-        list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_pickled_equality(df)
-
-
-def check_packed_serialized_equality(df):
-    # basic
-    assert_packed_frame_serializable(df)
-    # sliced
-    assert_packed_frame_serializable(df[:-1])
-    assert_packed_frame_serializable(df[1:])
-    assert_packed_frame_serializable(df[2:-2])
-    # sorted
-    sortvaldf = df.sort_values("vals")
-    assert isinstance(sortvaldf.index, Index)
-    assert_packed_frame_serializable(sortvaldf)
-
-
-def assert_packed_frame_serializable(df):
-    packed = pack(df)
-    header, frames = packed.serialize()
-    loaded = unpack(packed.deserialize(header, frames))
-    assert_eq(loaded, df)
-
-
-def test_serialize_packed_dataframe_numeric():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    nelem = 10
-    df["keys"] = np.arange(nelem, dtype=np.float64)
-    df["vals"] = rng.random(nelem)
-
-    check_packed_serialized_equality(df)
-
-
-def test_serialize_packed_dataframe_categorical():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = pd.Categorical(
-        ["a", "a", "a", "b", "a", "b", "a", "b", "a", "c"]
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_serialized_equality(df)
-
-
-def test_serialize_packed_dataframe_list():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(list([i, i + 1, i + 2] for i in range(10)))
-    df["vals"] = rng.random(len(df))
-
-    check_packed_serialized_equality(df)
-
-
-def test_serialize_packed_dataframe_struct():
-    rng = np.random.default_rng(seed=0)
-
-    df = DataFrame()
-    df["keys"] = Series(
-        list({"0": i, "1": i + 1, "2": i + 2} for i in range(10))
-    )
-    df["vals"] = rng.random(len(df))
-
-    check_packed_serialized_equality(df)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 13efa71ebae..77d1f77d30b 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -22,7 +22,6 @@
 from pyarrow import parquet as pq
 
 import cudf
-from cudf._lib.parquet import read_parquet_chunked
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.io.parquet import (
     ParquetDatasetWriter,
@@ -3775,13 +3774,14 @@ def test_parquet_chunked_reader(
     )
     buffer = BytesIO()
     df.to_parquet(buffer, row_group_size=10000)
-    actual = read_parquet_chunked(
-        [buffer],
-        chunk_read_limit=chunk_read_limit,
-        pass_read_limit=pass_read_limit,
-        use_pandas_metadata=use_pandas_metadata,
-        row_groups=row_groups,
-    )
+    with cudf.option_context("io.parquet.low_memory", True):
+        actual = cudf.read_parquet(
+            [buffer],
+            _chunk_read_limit=chunk_read_limit,
+            _pass_read_limit=pass_read_limit,
+            use_pandas_metadata=use_pandas_metadata,
+            row_groups=row_groups,
+        )
     expected = cudf.read_parquet(
         buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups
     )
@@ -3825,12 +3825,13 @@ def test_parquet_chunked_reader_structs(
     # Number of rows to read
     nrows = num_rows if num_rows is not None else len(df)
 
-    actual = read_parquet_chunked(
-        [buffer],
-        chunk_read_limit=chunk_read_limit,
-        pass_read_limit=pass_read_limit,
-        nrows=nrows,
-    )
+    with cudf.option_context("io.parquet.low_memory", True):
+        actual = cudf.read_parquet(
+            [buffer],
+            _chunk_read_limit=chunk_read_limit,
+            _pass_read_limit=pass_read_limit,
+            nrows=nrows,
+        )
     expected = cudf.read_parquet(
         buffer,
         nrows=nrows,
@@ -3877,12 +3878,13 @@ def test_parquet_chunked_reader_string_decoders(
     nrows = num_rows if num_rows is not None else len(df)
 
     # Check with num_rows specified
-    actual = read_parquet_chunked(
-        [buffer],
-        chunk_read_limit=chunk_read_limit,
-        pass_read_limit=pass_read_limit,
-        nrows=nrows,
-    )
+    with cudf.option_context("io.parquet.low_memory", True):
+        actual = cudf.read_parquet(
+            [buffer],
+            _chunk_read_limit=chunk_read_limit,
+            _pass_read_limit=pass_read_limit,
+            nrows=nrows,
+        )
     expected = cudf.read_parquet(
         buffer,
         nrows=nrows,
@@ -3982,13 +3984,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema):
     ).reset_index(drop=True)
 
     # Read with chunked reader (filter columns not supported)
-    got_chunked = read_parquet_chunked(
-        [buf1, buf2],
-        columns=["list", "d_list", "str"],
-        chunk_read_limit=240,
-        pass_read_limit=240,
-        allow_mismatched_pq_schemas=True,
-    )
+    with cudf.option_context("io.parquet.low_memory", True):
+        got_chunked = cudf.read_parquet(
+            [buf1, buf2],
+            columns=["list", "d_list", "str"],
+            _chunk_read_limit=240,
+            _pass_read_limit=240,
+            allow_mismatched_pq_schemas=True,
+        )
 
     # Construct the expected table without filter columns
     expected_chunked = cudf.concat(
@@ -4054,13 +4057,14 @@ def test_parquet_reader_with_mismatched_structs():
     )
 
     # Read with chunked reader
-    got_chunked = read_parquet_chunked(
-        [buf1, buf2],
-        columns=["struct.b.b_b.b_b_a"],
-        chunk_read_limit=240,
-        pass_read_limit=240,
-        allow_mismatched_pq_schemas=True,
-    )
+    with cudf.option_context("io.parquet.low_memory", True):
+        got_chunked = cudf.read_parquet(
+            [buf1, buf2],
+            columns=["struct.b.b_b.b_b_a"],
+            _chunk_read_limit=240,
+            _pass_read_limit=240,
+            allow_mismatched_pq_schemas=True,
+        )
     got_chunked = (
         cudf.Series(got_chunked["struct"])
         .struct.field("b")
diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py
index 9a2816f5444..84de2ac38e7 100644
--- a/python/cudf/cudf/tests/test_quantiles.py
+++ b/python/cudf/cudf/tests/test_quantiles.py
@@ -91,3 +91,19 @@ def test_quantile_type_int_float(interpolation):
 
     assert expected == actual
     assert type(expected) is type(actual)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [float("nan"), float("nan"), 0.9],
+        [float("nan"), float("nan"), float("nan")],
+    ],
+)
+def test_ignore_nans(data):
+    # pandas gives the expected value; cudf is the one under test.
+    psr = pd.Series(data)
+    gsr = cudf.Series(data, nan_as_null=False)
+    expected = psr.quantile(0.9)
+    result = gsr.quantile(0.9)
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index bf0c97adb00..2cb742727cc 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import textwrap
 
@@ -618,9 +618,9 @@ def test_timedelta_series_s_us_repr(data, dtype):
             cudf.Series([1000000, 200000, 3000000], dtype="timedelta64[ns]"),
             textwrap.dedent(
                 """
-            0    0 days 00:00:00.001000000
-            1    0 days 00:00:00.000200000
-            2    0 days 00:00:00.003000000
+            0    0 days 00:00:00.001000
+            1    0 days 00:00:00.000200
+            2    0 days 00:00:00.003000
             dtype: timedelta64[ns]
             """
             ),
@@ -710,12 +710,12 @@ def test_timedelta_series_s_us_repr(data, dtype):
             ),
             textwrap.dedent(
                 """
-            0    0 days 00:00:00.012
-            1    0 days 00:00:00.012
-            2    0 days 00:00:00.022
-            3    0 days 00:00:00.343
-            4    0 days 01:12:33.534
-            5    0 days 00:07:15.342
+            0    0 days 00:00:00.012000
+            1    0 days 00:00:00.012000
+            2    0 days 00:00:00.022000
+            3    0 days 00:00:00.343000
+            4    0 days 01:12:33.534000
+            5    0 days 00:07:15.342000
             dtype: timedelta64[ms]
             """
             ),
@@ -745,13 +745,13 @@ def test_timedelta_series_s_us_repr(data, dtype):
             ),
             textwrap.dedent(
                 """
-            0    0 days 00:00:00.001
-            1    0 days 00:00:01.132
-            2    0 days 06:27:03.231
-            3    0 days 00:00:00.233
-            4        0 days 00:00:00
-            5    0 days 00:00:00.332
-            6    0 days 00:00:00.323
+            0    0 days 00:00:00.001000
+            1    0 days 00:00:01.132000
+            2    0 days 06:27:03.231000
+            3    0 days 00:00:00.233000
+            4           0 days 00:00:00
+            5    0 days 00:00:00.332000
+            6    0 days 00:00:00.323000
             dtype: timedelta64[ms]
             """
             ),
@@ -771,13 +771,13 @@ def test_timedelta_series_s_us_repr(data, dtype):
             ),
             textwrap.dedent(
                 """
-            0    157937 days 02:23:52.432
-            1         1 days 13:25:36.784
-            2         2 days 20:09:05.345
-            3         2 days 14:03:52.411
-            4     11573 days 23:39:03.241
-            5        42 days 01:35:48.734
-            6         0 days 00:00:23.234
+            0    157937 days 02:23:52.432000
+            1         1 days 13:25:36.784000
+            2         2 days 20:09:05.345000
+            3         2 days 14:03:52.411000
+            4     11573 days 23:39:03.241000
+            5        42 days 01:35:48.734000
+            6         0 days 00:00:23.234000
             dtype: timedelta64[ms]
             """
             ),
@@ -824,13 +824,13 @@ def test_timedelta_series_s_us_repr(data, dtype):
             ),
             textwrap.dedent(
                 """
-            0    157937 days 02:23:52.432
-            1         1 days 13:25:36.784
-            2         2 days 20:09:05.345
-            3         2 days 14:03:52.411
-            4     11573 days 23:39:03.241
-            5        42 days 01:35:48.734
-            6         0 days 00:00:23.234
+            0    157937 days 02:23:52.432000
+            1         1 days 13:25:36.784000
+            2         2 days 20:09:05.345000
+            3         2 days 14:03:52.411000
+            4     11573 days 23:39:03.241000
+            5        42 days 01:35:48.734000
+            6         0 days 00:00:23.234000
             Name: abc, dtype: timedelta64[ms]
             """
             ),
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index fcd98831686..c14fab4040b 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -10,10 +10,11 @@
 import pytest
 from packaging import version
 
+import pylibcudf as plc
 import rmm
 
 import cudf
-from cudf._lib.copying import get_element
+from cudf.core.buffer import acquire_spill_lock
 from cudf.testing._utils import (
     ALL_TYPES,
     DATETIME_TYPES,
@@ -143,8 +144,14 @@ def test_scalar_host_initialization(value):
 @pytest.mark.parametrize("value", SCALAR_VALUES)
 def test_scalar_device_initialization(value):
     column = cudf.Series([value], nan_as_null=False)._column
-    dev_slr = get_element(column, 0)
-
+    with acquire_spill_lock():
+        dev_slr = cudf._lib.scalar.DeviceScalar.from_pylibcudf(
+            plc.copying.get_element(
+                column.to_pylibcudf(mode="read"),
+                0,
+            ),
+            dtype=column.dtype,
+        )
     s = cudf.Scalar.from_device_scalar(dev_slr)
 
     assert s._is_device_value_current
@@ -164,8 +171,14 @@ def test_scalar_device_initialization(value):
 def test_scalar_device_initialization_decimal(value, decimal_type):
     dtype = decimal_type._from_decimal(value)
     column = cudf.Series([str(value)]).astype(dtype)._column
-    dev_slr = get_element(column, 0)
-
+    with acquire_spill_lock():
+        dev_slr = cudf._lib.scalar.DeviceScalar.from_pylibcudf(
+            plc.copying.get_element(
+                column.to_pylibcudf(mode="read"),
+                0,
+            ),
+            dtype=column.dtype,
+        )
     s = cudf.Scalar.from_device_scalar(dev_slr)
 
     assert s._is_device_value_current
diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py
index 68f2aaf9cab..b50ed04427f 100644
--- a/python/cudf/cudf/tests/test_serialize.py
+++ b/python/cudf/cudf/tests/test_serialize.py
@@ -7,6 +7,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from packaging import version
 
 import cudf
 from cudf.testing import _utils as utils, assert_eq
@@ -149,13 +150,19 @@ def test_serialize(df, to_host):
 
 def test_serialize_dtype_error_checking():
     dtype = cudf.IntervalDtype("float", "right")
-    header, frames = dtype.serialize()
-    with pytest.raises(AssertionError):
-        # Invalid number of frames
-        type(dtype).deserialize(header, [None] * (header["frame_count"] + 1))
+    # Must call device_serialize (not serialize) to ensure that the type metadata is
+    # encoded in the header.
+    header, frames = dtype.device_serialize()
     with pytest.raises(AssertionError):
         # mismatching class
         cudf.StructDtype.deserialize(header, frames)
+    # The is-cuda flag list length must match the number of frames
+    header["is-cuda"] = [False]
+    with pytest.raises(AssertionError):
+        # Invalid number of frames
+        type(dtype).deserialize(
+            header, [np.zeros(1)] * (header["frame_count"] + 1)
+        )
 
 
 def test_serialize_dataframe():
@@ -382,6 +389,10 @@ def test_serialize_string_check_buffer_sizes():
     assert expect == got
 
 
+@pytest.mark.skipif(
+    version.parse(np.__version__) < version.parse("2.0.0"),
+    reason="The serialization of numpy 2.0 types is incompatible with numpy 1.x",
+)
 def test_deserialize_cudf_23_12(datadir):
     fname = datadir / "pkl" / "stringColumnWithRangeIndex_cudf_23.12.pkl"
 
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 99bd9adb034..f8697c5c6b8 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -772,6 +772,75 @@ def test_round_nan_as_null_false(series, decimal):
     assert_eq(result, expected, atol=1e-10)
 
 
+@pytest.mark.parametrize(
+    "data, dtype, decimals, expected_half_up, expected_half_even",
+    [
+        (
+            [1.234, 2.345, 3.456],
+            cudf.Decimal32Dtype(precision=5, scale=3),
+            2,
+            [1.23, 2.35, 3.46],
+            [1.23, 2.34, 3.46],
+        ),
+        (
+            [1.234, 2.345, 3.456],
+            cudf.Decimal32Dtype(precision=5, scale=3),
+            0,
+            [1.0, 2.0, 3.0],
+            [1.0, 2.0, 3.0],
+        ),
+        (
+            [1.234, 2.345, 3.456],
+            cudf.Decimal32Dtype(precision=5, scale=3),
+            3,
+            [1.234, 2.345, 3.456],
+            [1.234, 2.345, 3.456],
+        ),
+        (
+            [1.234567, 2.345678, 3.456789],
+            cudf.Decimal64Dtype(precision=10, scale=6),
+            4,
+            [1.2346, 2.3457, 3.4568],
+            [1.2346, 2.3457, 3.4568],
+        ),
+        (
+            [1.234567, 2.345678, 3.456789],
+            cudf.Decimal64Dtype(precision=10, scale=6),
+            2,
+            [1.23, 2.35, 3.46],
+            [1.23, 2.35, 3.46],
+        ),
+        (
+            [1.234567, 2.345678, 3.456789],
+            cudf.Decimal64Dtype(precision=10, scale=6),
+            6,
+            [1.234567, 2.345678, 3.456789],
+            [1.234567, 2.345678, 3.456789],
+        ),
+    ],
+)
+def test_series_round_decimal(
+    data, dtype, decimals, expected_half_up, expected_half_even
+):
+    """
+    Round a decimal Series with both the "half_up" and "half_even" modes.
+
+    Both the rounded result and the expected values are cast to `dtype`
+    before comparison, so the check is on the values at that dtype.
+    """
+    ser = cudf.Series(data).astype(dtype)
+
+    result_half_up = ser.round(decimals=decimals, how="half_up").astype(dtype)
+    expected_ser_half_up = cudf.Series(expected_half_up).astype(dtype)
+    assert_eq(result_half_up, expected_ser_half_up)
+
+    result_half_even = ser.round(decimals=decimals, how="half_even").astype(
+        dtype
+    )
+    expected_ser_half_even = cudf.Series(expected_half_even).astype(dtype)
+    assert_eq(result_half_even, expected_ser_half_even)
+
+
 @pytest.mark.parametrize("ps", _series_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
 def test_series_isnull_isna(ps, nan_as_null):
diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py
index 899d78c999b..b85943626a6 100644
--- a/python/cudf/cudf/tests/test_struct.py
+++ b/python/cudf/cudf/tests/test_struct.py
@@ -79,7 +79,7 @@ def test_series_construction_with_nulls():
 )
 def test_serialize_struct_dtype(fields):
     dtype = cudf.StructDtype(fields)
-    recreated = dtype.__class__.deserialize(*dtype.serialize())
+    recreated = dtype.__class__.device_deserialize(*dtype.device_serialize())
     assert recreated == dtype
 
 
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index d622ff6b94e..f1da2a060ec 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 import datetime
 import operator
@@ -1506,3 +1506,27 @@ def test_tdi_unit():
     result = pd_tdi.unit
     expected = cudf_tdi.unit
     assert result == expected
+
+
+@pytest.mark.parametrize("data", _TIMEDELTA_DATA)
+@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES)
+def test_timedelta_series_total_seconds(data, dtype):
+    """Series.dt.total_seconds() must agree with the pandas result."""
+    gsr = cudf.Series(data, dtype=dtype)
+    psr = gsr.to_pandas()
+
+    expected = psr.dt.total_seconds()
+    actual = gsr.dt.total_seconds()
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("data", _TIMEDELTA_DATA)
+@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES)
+def test_timedelta_index_total_seconds(data, dtype):
+    """Index.total_seconds() must agree with the pandas result."""
+    gi = cudf.Index(data, dtype=dtype)
+    pi = gi.to_pandas()
+
+    expected = pi.total_seconds()
+    actual = gi.total_seconds()
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 3637ef075f2..9a62285403f 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -882,7 +882,7 @@ def test_is_vowel_consonant():
     assert_eq(expected, actual)
 
 
-def test_minhash_permuted():
+def test_minhash():
     strings = cudf.Series(["this is my", "favorite book", None, ""])
 
     params = cudf.Series([1, 2, 3], dtype=np.uint32)
@@ -894,7 +894,7 @@ def test_minhash_permuted():
             cudf.Series([0, 0, 0], dtype=np.uint32),
         ]
     )
-    actual = strings.str.minhash_permuted(0, a=params, b=params, width=5)
+    actual = strings.str.minhash(0, a=params, b=params, width=5)
     assert_eq(expected, actual)
 
     params = cudf.Series([1, 2, 3], dtype=np.uint64)
@@ -912,78 +912,18 @@ def test_minhash_permuted():
             cudf.Series([0, 0, 0], dtype=np.uint64),
         ]
     )
-    actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5)
+    actual = strings.str.minhash64(0, a=params, b=params, width=5)
     assert_eq(expected, actual)
 
     # test wrong seed types
     with pytest.raises(ValueError):
-        strings.str.minhash_permuted(1, a="a", b="b", width=7)
+        strings.str.minhash(1, a="a", b="b", width=7)
     with pytest.raises(ValueError):
         params = cudf.Series([0, 1, 2], dtype=np.int32)
-        strings.str.minhash_permuted(1, a=params, b=params, width=6)
+        strings.str.minhash(1, a=params, b=params, width=6)
     with pytest.raises(ValueError):
         params = cudf.Series([0, 1, 2], dtype=np.uint32)
-        strings.str.minhash64_permuted(1, a=params, b=params, width=8)
-
-
-def test_word_minhash():
-    ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
-
-    expected = cudf.Series(
-        [
-            cudf.Series([21141582], dtype=np.uint32),
-            cudf.Series([962346254], dtype=np.uint32),
-        ]
-    )
-    actual = ls.str.word_minhash()
-    assert_eq(expected, actual)
-    seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-    expected = cudf.Series(
-        [
-            cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32),
-            cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32),
-        ]
-    )
-    actual = ls.str.word_minhash(seeds=seeds)
-    assert_eq(expected, actual)
-
-    expected = cudf.Series(
-        [
-            cudf.Series([2603139454418834912], dtype=np.uint64),
-            cudf.Series([5240044617220523711], dtype=np.uint64),
-        ]
-    )
-    actual = ls.str.word_minhash64()
-    assert_eq(expected, actual)
-    seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
-    expected = cudf.Series(
-        [
-            cudf.Series(
-                [
-                    2603139454418834912,
-                    8644371945174847701,
-                    5541030711534384340,
-                ],
-                dtype=np.uint64,
-            ),
-            cudf.Series(
-                [5240044617220523711, 5847101123925041457, 153762819128779913],
-                dtype=np.uint64,
-            ),
-        ]
-    )
-    actual = ls.str.word_minhash64(seeds=seeds)
-    assert_eq(expected, actual)
-
-    # test wrong seed types
-    with pytest.raises(ValueError):
-        ls.str.word_minhash(seeds="a")
-    with pytest.raises(ValueError):
-        seeds = cudf.Series([0, 1, 2], dtype=np.int32)
-        ls.str.word_minhash(seeds=seeds)
-    with pytest.raises(ValueError):
-        seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-        ls.str.word_minhash64(seeds=seeds)
+        strings.str.minhash64(1, a=params, b=params, width=8)
 
 
 def test_jaccard_index():
diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index d9dde58d998..574170d28c6 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Copyright (c) 2023-2025, NVIDIA CORPORATION.
 
 import glob
 import os
@@ -130,9 +130,7 @@ def _setup_numba():
             if driver_version < (12, 0):
                 patch_numba_linker_cuda_11()
             else:
-                from pynvjitlink.patch import patch_numba_linker
-
-                patch_numba_linker()
+                numba_config.CUDA_ENABLE_PYNVJITLINK = True
 
 
 class _CUDFNumbaConfig:
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 57bf08e6eec..31a8f4de3b3 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -151,7 +151,7 @@ def cudf_dtype_from_pydata_dtype(dtype):
         return cudf.core.dtypes.Decimal64Dtype
     elif cudf.api.types.is_decimal128_dtype(dtype):
         return cudf.core.dtypes.Decimal128Dtype
-    elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_LIBCUDF_TYPES:
+    elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES:
         return dtype.type
 
     return infer_dtype_from_object(dtype)
@@ -198,7 +198,7 @@ def to_cudf_compatible_scalar(val, dtype=None):
     If `val` is None, returns None.
     """
 
-    if cudf._lib.scalar._is_null_host_scalar(val) or isinstance(
+    if cudf.utils.utils._is_null_host_scalar(val) or isinstance(
         val, cudf.Scalar
     ):
         return val
diff --git a/python/cudf/cudf/utils/gpu_utils.py b/python/cudf/cudf/utils/gpu_utils.py
index b5387ddeb5f..10ab3f6bb1e 100644
--- a/python/cudf/cudf/utils/gpu_utils.py
+++ b/python/cudf/cudf/utils/gpu_utils.py
@@ -15,7 +15,7 @@ def validate_setup():
 
     import warnings
 
-    from cuda.cudart import cudaDeviceAttr, cudaError_t
+    from cuda.bindings.runtime import cudaDeviceAttr, cudaError_t
 
     from rmm._cuda.gpu import (
         CUDARuntimeError,
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 5681601d2be..a04fcb8df7a 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -3,39 +3,46 @@
 
 import datetime
 import functools
+import json
 import operator
 import os
 import urllib
 import warnings
 from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper
 from threading import Thread
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 import fsspec
 import fsspec.implementations.local
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from fsspec.core import expand_paths_if_needed, get_fs_token_paths
 
 import cudf
 from cudf.api.types import is_list_like
 from cudf.core._compat import PANDAS_LT_300
 from cudf.utils.docutils import docfmt_partial
+from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes, np_to_pa_dtype
 
 try:
     import fsspec.parquet as fsspec_parquet
-
 except ImportError:
     fsspec_parquet = None
 
+
 if TYPE_CHECKING:
-    from collections.abc import Callable
+    from collections.abc import Callable, Hashable
 
     from cudf.core.column import ColumnBase
 
 
+PARQUET_META_TYPE_MAP = {
+    str(cudf_dtype): str(pandas_dtype)
+    for cudf_dtype, pandas_dtype in np_dtypes_to_pandas_dtypes.items()
+}
+
 _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024
-_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max
 
 _docstring_remote_sources = """
 - cuDF supports local and remote data stores. See configuration details for
@@ -1487,6 +1494,181 @@
 )
 
 
+def _index_level_name(
+    index_name: Hashable, level: int, column_names: list[Hashable]
+) -> Hashable:
+    """
+    Return the name of an index level or a default name
+    if `index_name` is None or is already a column name.
+
+    Parameters
+    ----------
+    index_name : name of an Index object
+    level : level of the Index object
+    column_names : names already taken by columns
+
+    Returns
+    -------
+    name : Hashable
+        `index_name` itself, or ``"__index_level_{level}__"``.
+    """
+    if index_name is not None and index_name not in column_names:
+        return index_name
+    else:
+        return f"__index_level_{level}__"
+
+
+def generate_pandas_metadata(table: cudf.DataFrame, index: bool | None) -> str:
+    """
+    Build the pandas metadata JSON string for `table`.
+
+    Parameters
+    ----------
+    table : cudf.DataFrame
+        Frame whose columns and index are described.
+    index : bool or None
+        ``False`` leaves the index out of the metadata. ``None``
+        describes a RangeIndex by its start/stop/step, while any
+        other value materializes it as a regular index level.
+        Non-range indexes are always recorded as index levels
+        unless `index` is ``False``.
+
+    Returns
+    -------
+    str
+        JSON-encoded content of the ``b"pandas"`` metadata entry.
+
+    Raises
+    ------
+    ValueError
+        If a column or index level has a categorical dtype.
+    """
+    col_names: list[Hashable] = []
+    types = []
+    index_levels = []
+    index_descriptors = []
+    columns_to_convert = list(table._columns)
+    # Columns
+    for name, col in table._column_labels_and_values:
+        if cudf.get_option("mode.pandas_compatible"):
+            # in pandas-compat mode, non-string column names are stringified.
+            col_names.append(str(name))
+        else:
+            col_names.append(name)
+
+        if isinstance(col.dtype, cudf.CategoricalDtype):
+            raise ValueError(
+                "'category' column dtypes are currently not "
+                + "supported by the gpu accelerated parquet writer"
+            )
+        elif isinstance(
+            col.dtype,
+            (cudf.ListDtype, cudf.StructDtype, cudf.core.dtypes.DecimalDtype),
+        ):
+            types.append(col.dtype.to_arrow())
+        else:
+            # A boolean element takes 8 bits in cudf and 1 bit in
+            # pyarrow. To make sure the cudf format is interoperable
+            # with arrow, we use `int8` type when converting from a
+            # cudf boolean array.
+            if col.dtype.type == np.bool_:
+                types.append(pa.int8())
+            else:
+                types.append(np_to_pa_dtype(col.dtype))
+
+    # Indexes
+    materialize_index = False
+    if index is not False:
+        for level, name in enumerate(table.index.names):
+            if isinstance(table.index, cudf.MultiIndex):
+                idx = table.index.get_level_values(level)
+            else:
+                idx = table.index
+
+            if isinstance(idx, cudf.RangeIndex):
+                if index is None:
+                    descr: dict[str, Any] | Hashable = {
+                        "kind": "range",
+                        "name": table.index.name,
+                        "start": table.index.start,
+                        "stop": table.index.stop,
+                        "step": table.index.step,
+                    }
+                else:
+                    materialize_index = True
+                    # When `index=True`, RangeIndex needs to be materialized.
+                    materialized_idx = idx._as_int_index()
+                    descr = _index_level_name(
+                        index_name=materialized_idx.name,
+                        level=level,
+                        column_names=col_names,
+                    )
+                    index_levels.append(materialized_idx)
+                    columns_to_convert.append(materialized_idx._values)
+                    col_names.append(descr)
+                    types.append(np_to_pa_dtype(materialized_idx.dtype))
+            else:
+                descr = _index_level_name(
+                    index_name=idx.name, level=level, column_names=col_names
+                )
+                columns_to_convert.append(idx._values)
+                col_names.append(descr)
+                if isinstance(idx.dtype, cudf.CategoricalDtype):
+                    raise ValueError(
+                        "'category' column dtypes are currently not "
+                        + "supported by the gpu accelerated parquet writer"
+                    )
+                elif isinstance(idx.dtype, cudf.ListDtype):
+                    # Use the index level's own dtype here, not `col`,
+                    # which is only a leftover from the column loop above.
+                    types.append(idx.dtype.to_arrow())
+                else:
+                    # A boolean element takes 8 bits in cudf and 1 bit in
+                    # pyarrow. To make sure the cudf format is interoperable
+                    # in arrow, we use `int8` type when converting from a
+                    # cudf boolean array.
+                    if idx.dtype.type == np.bool_:
+                        types.append(pa.int8())
+                    else:
+                        types.append(np_to_pa_dtype(idx.dtype))
+
+                index_levels.append(idx)
+            index_descriptors.append(descr)
+
+    df_meta = table.head(0)
+    if materialize_index:
+        df_meta.index = df_meta.index._as_int_index()
+    metadata = pa.pandas_compat.construct_metadata(
+        columns_to_convert=columns_to_convert,
+        # It is OKAY to do `.head(0).to_pandas()` because
+        # this method will extract `.columns` metadata only
+        df=df_meta.to_pandas(),
+        column_names=col_names,
+        index_levels=index_levels,
+        index_descriptors=index_descriptors,
+        preserve_index=index,
+        types=types,
+    )
+
+    md_dict = json.loads(metadata[b"pandas"])
+
+    # correct metadata for list and struct and nullable numeric types
+    for col_meta in md_dict["columns"]:
+        if (
+            col_meta["name"] in table._column_names
+            and table._data[col_meta["name"]].nullable
+            and col_meta["numpy_type"] in PARQUET_META_TYPE_MAP
+            and col_meta["pandas_type"] != "decimal"
+        ):
+            col_meta["numpy_type"] = PARQUET_META_TYPE_MAP[
+                col_meta["numpy_type"]
+            ]
+        if col_meta["numpy_type"] in ("list", "struct"):
+            col_meta["numpy_type"] = "object"
+
+    return json.dumps(md_dict)
+
+
 def is_url(url):
     """Check if a string is a valid URL to a network location.
 
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index c83c1cbe895..0adaaa60654 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -341,6 +341,15 @@ def is_na_like(obj):
     return obj is None or obj is cudf.NA or obj is cudf.NaT
 
 
+def _is_null_host_scalar(slr) -> bool:
+    """Return whether `slr` is NA-like or a NaT-like host scalar."""
+    return (
+        is_na_like(slr)
+        or (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr))
+        or slr is pd.NaT
+    )
+
+
 def _warn_no_dask_cudf(fn):
     @functools.wraps(fn)
     def wrapper(self):
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
index e726b7fdca1..3891110e9d3 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
@@ -76,13 +76,6 @@ files:
       - py_version
       - test_base
       - test_xgboost
-  test_catboost:
-    output: none
-    includes:
-      - cuda_version
-      - py_version
-      - test_base
-      - test_catboost
   test_cuml:
     output: none
     includes:
@@ -251,14 +244,6 @@ dependencies:
           - pip
           - pip:
             - xgboost>=2.0.1
-  test_catboost:
-    common:
-      - output_types: conda
-        packages:
-          - numpy
-          - scipy
-          - scikit-learn
-          - catboost
   test_cuml:
     common:
       - output_types: conda
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py
deleted file mode 100644
index 04cc69231fe..00000000000
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_catboost.py
+++ /dev/null
@@ -1,129 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-import numpy as np
-import pandas as pd
-import pytest
-from catboost import CatBoostClassifier, CatBoostRegressor, Pool
-from sklearn.datasets import make_classification, make_regression
-
-rng = np.random.default_rng(seed=42)
-
-
-def assert_catboost_equal(expect, got, rtol=1e-7, atol=0.0):
-    if isinstance(expect, (tuple, list)):
-        assert len(expect) == len(got)
-        for e, g in zip(expect, got):
-            assert_catboost_equal(e, g, rtol, atol)
-    elif isinstance(expect, np.ndarray):
-        np.testing.assert_allclose(expect, got, rtol=rtol, atol=atol)
-    elif isinstance(expect, pd.DataFrame):
-        pd.testing.assert_frame_equal(expect, got)
-    elif isinstance(expect, pd.Series):
-        pd.testing.assert_series_equal(expect, got)
-    else:
-        assert expect == got
-
-
-pytestmark = pytest.mark.assert_eq(fn=assert_catboost_equal)
-
-
-@pytest.fixture
-def regression_data():
-    X, y = make_regression(n_samples=100, n_features=10, random_state=42)
-    return pd.DataFrame(X), pd.Series(y)
-
-
-@pytest.fixture
-def classification_data():
-    X, y = make_classification(
-        n_samples=100, n_features=10, n_classes=2, random_state=42
-    )
-    return pd.DataFrame(X), pd.Series(y)
-
-
-def test_catboost_regressor_with_dataframe(regression_data):
-    X, y = regression_data
-    model = CatBoostRegressor(iterations=10, verbose=0)
-    model.fit(X, y)
-    predictions = model.predict(X)
-    return predictions
-
-
-def test_catboost_regressor_with_numpy(regression_data):
-    X, y = regression_data
-    model = CatBoostRegressor(iterations=10, verbose=0)
-    model.fit(X.values, y.values)
-    predictions = model.predict(X.values)
-    return predictions
-
-
-def test_catboost_classifier_with_dataframe(classification_data):
-    X, y = classification_data
-    model = CatBoostClassifier(iterations=10, verbose=0)
-    model.fit(X, y)
-    predictions = model.predict(X)
-    return predictions
-
-
-def test_catboost_classifier_with_numpy(classification_data):
-    X, y = classification_data
-    model = CatBoostClassifier(iterations=10, verbose=0)
-    model.fit(X.values, y.values)
-    predictions = model.predict(X.values)
-    return predictions
-
-
-def test_catboost_with_pool_and_dataframe(regression_data):
-    X, y = regression_data
-    train_pool = Pool(X, y)
-    model = CatBoostRegressor(iterations=10, verbose=0)
-    model.fit(train_pool)
-    predictions = model.predict(X)
-    return predictions
-
-
-def test_catboost_with_pool_and_numpy(regression_data):
-    X, y = regression_data
-    train_pool = Pool(X.values, y.values)
-    model = CatBoostRegressor(iterations=10, verbose=0)
-    model.fit(train_pool)
-    predictions = model.predict(X.values)
-    return predictions
-
-
-def test_catboost_with_categorical_features():
-    data = {
-        "numerical_feature": rng.standard_normal(100),
-        "categorical_feature": rng.choice(["A", "B", "C"], size=100),
-        "target": rng.integers(0, 2, size=100),
-    }
-    df = pd.DataFrame(data)
-    X = df[["numerical_feature", "categorical_feature"]]
-    y = df["target"]
-    cat_features = ["categorical_feature"]
-    model = CatBoostClassifier(
-        iterations=10, verbose=0, cat_features=cat_features
-    )
-    model.fit(X, y)
-    predictions = model.predict(X)
-    return predictions
-
-
-@pytest.mark.parametrize(
-    "X, y",
-    [
-        (
-            pd.DataFrame(rng.standard_normal((100, 5))),
-            pd.Series(rng.standard_normal(100)),
-        ),
-        (rng.standard_normal((100, 5)), rng.standard_normal(100)),
-    ],
-)
-def test_catboost_train_test_split(X, y):
-    from sklearn.model_selection import train_test_split
-
-    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
-    model = CatBoostRegressor(iterations=10, verbose=0)
-    model.fit(X_train, y_train)
-    predictions = model.predict(X_test)
-    return len(X_train), len(X_test), len(y_train), len(y_test), predictions
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py
index bef02c86355..8be48953974 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py
@@ -71,6 +71,9 @@ def test_holoviews_heatmap(df):
     )
 
 
+@pytest.mark.skip(
+    reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
+)
 def test_holoviews_histogram(df):
     return get_plot_info(hv.Histogram(df.values))
 
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py
index 1909392b9f7..c91808021e8 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py
@@ -33,6 +33,9 @@ def assert_plots_equal(expect, got):
 pytestmark = pytest.mark.assert_eq(fn=assert_plots_equal)
 
 
+@pytest.mark.skip(
+    reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
+)
 def test_line():
     df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]})
     (data,) = plt.plot(df["x"], df["y"], marker="o", linestyle="-")
@@ -40,6 +43,9 @@ def test_line():
     return plt.gca()
 
 
+@pytest.mark.skip(
+    reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
+)
 def test_bar():
     data = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
     ax = data.plot(kind="bar")
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py
index 472f1889354..4d35d9e8946 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py
@@ -37,6 +37,9 @@ def test_numpy_dot(df):
     return np.dot(df, df.T)
 
 
+@pytest.mark.skip(
+    reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
+)
 def test_numpy_fft(sr):
     fft = np.fft.fft(sr)
     return fft
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
index ad287471aa0..7cea635afc4 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
@@ -116,6 +116,9 @@ def test_torch_train(data):
     return model(test_x1, test_x2)
 
 
+@pytest.mark.skip(
+    reason="AssertionError: The values for attribute 'device' do not match: cpu != cuda:0."
+)
 def test_torch_tensor_ctor():
     s = pd.Series(range(5))
     return torch.tensor(s.values)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py
index 021c5bac9b7..f6a8a96ae3c 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py
@@ -54,6 +54,9 @@ def test_scatter(df):
     return ax
 
 
+@pytest.mark.skip(
+    reason="AttributeError: 'ndarray' object has no attribute '_fsproxy_wrapped'"
+)
 def test_lineplot_with_sns_data():
     df = sns.load_dataset("flights")
     ax = sns.lineplot(data=df, x="month", y="passengers")
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py
index 0777d982ac2..f275659288e 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py
@@ -41,7 +41,7 @@ def test_multidimensional_distributed_timeseries(dask_client):
     rng = np.random.default_rng(seed=42)
     # Each row represents data from a different dimension while each column represents
     # data from the same dimension
-    your_time_series = rng.random(3, 1000)
+    your_time_series = rng.random((3, 1000))
     # Approximately, how many data points might be found in a pattern
     window_size = 50
 
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py
index ba1f518cbfd..b4fad3024e7 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py
@@ -271,6 +271,7 @@ def call(self, values):
         return tf.concat(values, axis=-1)
 
 
+@pytest.mark.xfail(reason="ValueError: Invalid dtype: object")
 def test_full_example_train_with_df(df, target):
     # https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example
     # Inputs are directly passed as dictionary of series
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py
index 70f1e6a4250..ba98273404d 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py
@@ -124,6 +124,11 @@ def test_predict(device: str) -> np.ndarray:
     predt0 = reg.predict(X_df)
 
     predt1 = booster.inplace_predict(X_df)
+    # After https://github.com/dmlc/xgboost/pull/11014, .inplace_predict()
+    # returns a real cupy array when called on a cudf.pandas proxy dataframe.
+    # So we need to ensure we have a valid numpy array.
+    if not isinstance(predt1, np.ndarray):
+        predt1 = predt1.get()
     np.testing.assert_allclose(predt0, predt1)
 
     predt2 = booster.predict(xgb.DMatrix(X_df))
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 80de9056a0a..c6a5887f85d 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 
 [build-system]
 build-backend = "rapids_build_backend.build"
@@ -20,11 +20,11 @@ requires-python = ">=3.10"
 dependencies = [
     "cachetools",
     "cubinlinker",
-    "cuda-python>=11.7.1,<12.0a0",
+    "cuda-python>=11.8.5,<12.0a0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "libcudf==25.2.*,>=0.0.0a0",
-    "numba-cuda>=0.0.13,<0.0.18",
+    "numba-cuda>=0.2.0,<0.3.0",
     "numpy>=1.23,<3.0a0",
     "nvtx>=0.2.1",
     "packaging",
@@ -98,8 +98,6 @@ filterwarnings = [
     "error",
     "ignore:::.*xdist.*",
     "ignore:::.*pytest.*",
-    # https://github.com/rapidsai/build-planning/issues/116
-    "ignore:.*cuda..* module is deprecated.*:DeprecationWarning",
     # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow()
     "ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore",
     # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 29fcd161444..a9d937435e9 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -60,8 +60,6 @@ addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
 filterwarnings = [
   "error",
-  # https://github.com/rapidsai/build-planning/issues/116
-  "ignore:.*cuda..* module is deprecated.*:DeprecationWarning",
 ]
 xfail_strict = true
 
diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index 29d3dc4ae79..074096446fd 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -231,7 +231,8 @@ def validate_config_options(config: dict) -> None:
     executor = config.get("executor", "pylibcudf")
     if executor == "dask-experimental":
         unsupported = config.get("executor_options", {}).keys() - {
-            "max_rows_per_partition"
+            "max_rows_per_partition",
+            "parquet_blocksize",
         }
     else:
         unsupported = config.get("executor_options", {}).keys()
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
index 624a9bd87ea..92f39abe71e 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 # TODO: remove need for this
 # ruff: noqa: D101
@@ -40,6 +40,7 @@ def __init__(
         self.dtype = dtype
         self.name = name
         self.options = options
+        self.is_pointwise = False
         self.children = children
         if name not in Agg._SUPPORTED:
             raise NotImplementedError(
@@ -68,7 +69,11 @@ def __init__(
             # TODO: handle nans
             req = plc.aggregation.variance(ddof=options)
         elif name == "count":
-            req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE)
+            req = plc.aggregation.count(
+                null_handling=plc.types.NullPolicy.EXCLUDE
+                if not options
+                else plc.types.NullPolicy.INCLUDE
+            )
         elif name == "quantile":
             _, quantile = self.children
             if not isinstance(quantile, Literal):
@@ -86,7 +91,7 @@ def __init__(
             op = partial(self._reduce, request=req)
         elif name in {"min", "max"}:
             op = partial(op, propagate_nans=options)
-        elif name in {"count", "first", "last"}:
+        elif name in {"count", "sum", "first", "last"}:
             pass
         else:
             raise NotImplementedError(
@@ -175,6 +180,18 @@ def _count(self, column: Column) -> Column:
             )
         )
 
+    def _sum(self, column: Column) -> Column:
+        if column.obj.size() == 0:
+            return Column(
+                plc.Column.from_scalar(
+                    plc.interop.from_arrow(
+                        pa.scalar(0, type=plc.interop.to_arrow(self.dtype))
+                    ),
+                    1,
+                )
+            )
+        return self._reduce(column, request=plc.aggregation.sum())
+
     def _min(self, column: Column, *, propagate_nans: bool) -> Column:
         if propagate_nans and column.nan_count > 0:
             return Column(
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py
index 4c7ae007070..8ba3f9f407c 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py
@@ -36,9 +36,11 @@ class ExecutionContext(IntEnum):
 class Expr(Node["Expr"]):
     """An abstract expression object."""
 
-    __slots__ = ("dtype",)
+    __slots__ = ("dtype", "is_pointwise")
     dtype: plc.DataType
     """Data type of the expression."""
+    is_pointwise: bool
+    """Whether this expression acts pointwise on its inputs."""
     # This annotation is needed because of https://github.com/python/mypy/issues/17981
     _non_child: ClassVar[tuple[str, ...]] = ("dtype",)
     """Names of non-child data (not Exprs) for reconstruction."""
@@ -164,6 +166,7 @@ def __init__(self, dtype: plc.DataType, error: str) -> None:
         self.dtype = dtype
         self.error = error
         self.children = ()
+        self.is_pointwise = True
 
 
 class NamedExpr:
@@ -243,6 +246,7 @@ class Col(Expr):
     def __init__(self, dtype: plc.DataType, name: str) -> None:
         self.dtype = dtype
         self.name = name
+        self.is_pointwise = True
         self.children = ()
 
     def do_evaluate(
@@ -280,6 +284,7 @@ def __init__(
         self.dtype = dtype
         self.index = index
         self.table_ref = table_ref
+        self.is_pointwise = True
         self.children = (column,)
 
     def do_evaluate(
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
index 245bdbefe88..556847b4738 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/binaryop.py
@@ -42,6 +42,7 @@ def __init__(
             op = BinOp._BOOL_KLEENE_MAPPING.get(op, op)
         self.op = op
         self.children = (left, right)
+        self.is_pointwise = True
         if not plc.binaryop.is_supported_operation(
             self.dtype, left.dtype, right.dtype, op
         ):
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
index 5aa35ead127..d5ca22dd8d5 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/boolean.py
@@ -81,6 +81,14 @@ def __init__(
         self.options = options
         self.name = name
         self.children = children
+        self.is_pointwise = self.name not in (
+            BooleanFunction.Name.All,
+            BooleanFunction.Name.Any,
+            BooleanFunction.Name.IsDuplicated,
+            BooleanFunction.Name.IsFirstDistinct,
+            BooleanFunction.Name.IsLastDistinct,
+            BooleanFunction.Name.IsUnique,
+        )
         if self.name is BooleanFunction.Name.IsIn and not all(
             c.dtype == self.children[0].dtype for c in self.children
         ):
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
index c2dddfd9940..0c3159c73d6 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py
@@ -114,6 +114,7 @@ def __init__(
         self.options = options
         self.name = name
         self.children = children
+        self.is_pointwise = True
         if self.name not in self._COMPONENT_MAP:
             raise NotImplementedError(f"Temporal function {self.name}")
 
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
index 7eba0c110ab..8528e66c69c 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/literal.py
@@ -38,6 +38,7 @@ def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None:
         assert value.type == plc.interop.to_arrow(dtype)
         self.value = value
         self.children = ()
+        self.is_pointwise = True
 
     def do_evaluate(
         self,
@@ -65,6 +66,7 @@ def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
         data = value.to_arrow()
         self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
         self.children = ()
+        self.is_pointwise = True
 
     def get_hashable(self) -> Hashable:
         """Compute a hash of the column."""
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
index 48c37d101f4..d4616d5d00a 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/rolling.py
@@ -24,6 +24,7 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None:
         self.dtype = dtype
         self.options = options
         self.children = (agg,)
+        self.is_pointwise = False
         raise NotImplementedError("Rolling window not implemented")
 
 
@@ -35,4 +36,5 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> N
         self.dtype = dtype
         self.options = options
         self.children = (agg, *by)
+        self.is_pointwise = False
         raise NotImplementedError("Grouped rolling window not implemented")
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
index 12326740f74..93ecd026eaf 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/selection.py
@@ -30,6 +30,7 @@ class Gather(Expr):
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None:
         self.dtype = dtype
         self.children = (values, indices)
+        self.is_pointwise = False
 
     def do_evaluate(
         self,
@@ -71,6 +72,7 @@ class Filter(Expr):
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
         self.dtype = dtype
         self.children = (values, indices)
+        self.is_pointwise = True
 
     def do_evaluate(
         self,
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py
index 99512e2ef52..189f109e1a2 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/sorting.py
@@ -32,6 +32,7 @@ def __init__(
         self.dtype = dtype
         self.options = options
         self.children = (column,)
+        self.is_pointwise = False
 
     def do_evaluate(
         self,
@@ -71,6 +72,7 @@ def __init__(
         self.dtype = dtype
         self.options = options
         self.children = (column, *by)
+        self.is_pointwise = False
 
     def do_evaluate(
         self,
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py
index 124a6e8d71c..256840c1f3d 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py
@@ -106,6 +106,7 @@ def __init__(
         self.options = options
         self.name = name
         self.children = children
+        self.is_pointwise = True
         self._validate_input()
 
     def _validate_input(self):
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py
index d2b5d6bae29..120ca8edce0 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/ternary.py
@@ -34,6 +34,7 @@ def __init__(
     ) -> None:
         self.dtype = dtype
         self.children = (when, then, otherwise)
+        self.is_pointwise = True
 
     def do_evaluate(
         self,
diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
index 10caaff6811..3336c901e7f 100644
--- a/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
+++ b/python/cudf_polars/cudf_polars/dsl/expressions/unary.py
@@ -33,6 +33,7 @@ class Cast(Expr):
     def __init__(self, dtype: plc.DataType, value: Expr) -> None:
         self.dtype = dtype
         self.children = (value,)
+        self.is_pointwise = True
         if not dtypes.can_cast(value.dtype, self.dtype):
             raise NotImplementedError(
                 f"Can't cast {value.dtype.id().name} to {self.dtype.id().name}"
@@ -63,6 +64,7 @@ class Len(Expr):
     def __init__(self, dtype: plc.DataType) -> None:
         self.dtype = dtype
         self.children = ()
+        self.is_pointwise = False
 
     def do_evaluate(
         self,
@@ -147,6 +149,14 @@ def __init__(
         self.name = name
         self.options = options
         self.children = children
+        self.is_pointwise = self.name not in (
+            "cum_min",
+            "cum_max",
+            "cum_prod",
+            "cum_sum",
+            "drop_nulls",
+            "unique",
+        )
 
         if self.name not in UnaryFunction._supported_fns:
             raise NotImplementedError(f"Unary function {name=}")
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 1faa778ccf6..fd56329a48e 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """
 DSL nodes for the LogicalPlan of polars.
@@ -34,9 +34,11 @@
 from cudf_polars.utils.versions import POLARS_VERSION_GT_112
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, Hashable, MutableMapping, Sequence
+    from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence
     from typing import Literal
 
+    from polars.polars import _expr_nodes as pl_expr
+
     from cudf_polars.typing import Schema
 
 
@@ -517,17 +519,22 @@ def do_evaluate(
         elif typ == "parquet":
             parquet_options = config_options.get("parquet_options", {})
             if parquet_options.get("chunked", True):
+                options = plc.io.parquet.ParquetReaderOptions.builder(
+                    plc.io.SourceInfo(paths)
+                ).build()
+                # We handle skip_rows != 0 by reading up to
+                # n_rows + skip_rows rows and slicing off the
+                # first skip_rows entries.
+                # TODO: Remove this workaround once
+                # https://github.com/rapidsai/cudf/issues/16186
+                # is fixed
+                nrows = n_rows + skip_rows
+                if nrows > -1:
+                    options.set_num_rows(nrows)
+                if with_columns is not None:
+                    options.set_columns(with_columns)
                 reader = plc.io.parquet.ChunkedParquetReader(
-                    plc.io.SourceInfo(paths),
-                    columns=with_columns,
-                    # We handle skip_rows != 0 by reading from the
-                    # up to n_rows + skip_rows and slicing off the
-                    # first skip_rows entries.
-                    # TODO: Remove this workaround once
-                    # https://github.com/rapidsai/cudf/issues/16186
-                    # is fixed
-                    nrows=n_rows + skip_rows,
-                    skip_rows=0,
+                    options,
                     chunk_read_limit=parquet_options.get(
                         "chunk_read_limit", cls.PARQUET_DEFAULT_CHUNK_SIZE
                     ),
@@ -573,13 +580,18 @@ def slice_skip(tbl: plc.Table):
                 if predicate is not None and row_index is None:
                     # Can't apply filters during read if we have a row index.
                     filters = to_parquet_filter(predicate.value)
-                tbl_w_meta = plc.io.parquet.read_parquet(
-                    plc.io.SourceInfo(paths),
-                    columns=with_columns,
-                    filters=filters,
-                    nrows=n_rows,
-                    skip_rows=skip_rows,
-                )
+                options = plc.io.parquet.ParquetReaderOptions.builder(
+                    plc.io.SourceInfo(paths)
+                ).build()
+                if n_rows != -1:
+                    options.set_num_rows(n_rows)
+                if skip_rows != 0:
+                    options.set_skip_rows(skip_rows)
+                if with_columns is not None:
+                    options.set_columns(with_columns)
+                if filters is not None:
+                    options.set_filter(filters)
+                tbl_w_meta = plc.io.parquet.read_parquet(options)
                 df = DataFrame.from_table(
                     tbl_w_meta.tbl,
                     # TODO: consider nested column names?
@@ -594,10 +606,12 @@ def slice_skip(tbl: plc.Table):
                 (name, typ, []) for name, typ in schema.items()
             ]
             plc_tbl_w_meta = plc.io.json.read_json(
-                plc.io.SourceInfo(paths),
-                lines=True,
-                dtypes=json_schema,
-                prune_columns=True,
+                plc.io.json._setup_json_reader_options(
+                    plc.io.SourceInfo(paths),
+                    lines=True,
+                    dtypes=json_schema,
+                    prune_columns=True,
+                )
             )
             # TODO: I don't think cudf-polars supports nested types in general right now
             # (but when it does, we should pass child column names from nested columns in)
@@ -1007,7 +1021,27 @@ class ConditionalJoin(IR):
     __slots__ = ("ast_predicate", "options", "predicate")
     _non_child = ("schema", "predicate", "options")
     predicate: expr.Expr
-    options: tuple
+    """Expression predicate to join on"""
+    options: tuple[
+        tuple[
+            str,
+            pl_expr.Operator | Iterable[pl_expr.Operator],
+        ],
+        bool,
+        tuple[int, int] | None,
+        str,
+        bool,
+        Literal["none", "left", "right", "left_right", "right_left"],
+    ]
+    """
+    tuple of options:
+    - predicates: tuple of ir join type (e.g. ie_join) and (In)Equality conditions
+    - join_nulls: do nulls compare equal?
+    - slice: optional slice to perform after joining.
+    - suffix: string suffix for right columns if names match
+    - coalesce: should key columns be coalesced (only makes sense for outer joins)
+    - maintain_order: which DataFrame row order to preserve, if any
+    """
 
     def __init__(
         self, schema: Schema, predicate: expr.Expr, options: tuple, left: IR, right: IR
@@ -1017,15 +1051,16 @@ def __init__(
         self.options = options
         self.children = (left, right)
         self.ast_predicate = to_ast(predicate)
-        _, join_nulls, zlice, suffix, coalesce = self.options
+        _, join_nulls, zlice, suffix, coalesce, maintain_order = self.options
         # Preconditions from polars
         assert not join_nulls
         assert not coalesce
+        assert maintain_order == "none"
         if self.ast_predicate is None:
             raise NotImplementedError(
                 f"Conditional join with predicate {predicate}"
             )  # pragma: no cover; polars never delivers expressions we can't handle
-        self._non_child_args = (self.ast_predicate, zlice, suffix)
+        self._non_child_args = (self.ast_predicate, zlice, suffix, maintain_order)
 
     @classmethod
     def do_evaluate(
@@ -1033,6 +1068,7 @@ def do_evaluate(
         predicate: plc.expressions.Expression,
         zlice: tuple[int, int] | None,
         suffix: str,
+        maintain_order: Literal["none", "left", "right", "left_right", "right_left"],
         left: DataFrame,
         right: DataFrame,
     ) -> DataFrame:
@@ -1076,6 +1112,7 @@ class Join(IR):
         tuple[int, int] | None,
         str,
         bool,
+        Literal["none", "left", "right", "left_right", "right_left"],
     ]
     """
     tuple of options:
@@ -1084,6 +1121,7 @@ class Join(IR):
     - slice: optional slice to perform after joining.
     - suffix: string suffix for right columns if names match
     - coalesce: should key columns be coalesced (only makes sense for outer joins)
+    - maintain_order: which DataFrame row order to preserve, if any
     """
 
     def __init__(
@@ -1101,6 +1139,9 @@ def __init__(
         self.options = options
         self.children = (left, right)
         self._non_child_args = (self.left_on, self.right_on, self.options)
+        # TODO: Implement maintain_order
+        if options[5] != "none":
+            raise NotImplementedError("maintain_order not implemented yet")
         if any(
             isinstance(e.value, expr.Literal)
             for e in itertools.chain(self.left_on, self.right_on)
@@ -1210,12 +1251,13 @@ def do_evaluate(
             tuple[int, int] | None,
             str,
             bool,
+            Literal["none", "left", "right", "left_right", "right_left"],
         ],
         left: DataFrame,
         right: DataFrame,
     ) -> DataFrame:
         """Evaluate and return a dataframe."""
-        how, join_nulls, zlice, suffix, coalesce = options
+        how, join_nulls, zlice, suffix, coalesce, _ = options
         if how == "cross":
             # Separate implementation, since cross_join returns the
             # result, not the gather maps
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 37cf36dc4dd..2138ac0c700 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Translate polars IR representation to ours."""
@@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR:
         # IR is versioned with major.minor, minor is bumped for backwards
         # compatible changes (e.g. adding new nodes), major is bumped for
         # incompatible changes (e.g. renaming nodes).
-        if (version := self.visitor.version()) >= (4, 0):
+        if (version := self.visitor.version()) >= (4, 3):
             e = NotImplementedError(
                 f"No support for polars IR {version=}"
             )  # pragma: no cover; no such version for now.
diff --git a/python/cudf_polars/cudf_polars/dsl/traversal.py b/python/cudf_polars/cudf_polars/dsl/traversal.py
index b3248dae93c..9c45a68812a 100644
--- a/python/cudf_polars/cudf_polars/dsl/traversal.py
+++ b/python/cudf_polars/cudf_polars/dsl/traversal.py
@@ -10,7 +10,7 @@
 from cudf_polars.typing import U_contra, V_co
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, Generator, Mapping, MutableMapping
+    from collections.abc import Callable, Generator, Mapping, MutableMapping, Sequence
 
     from cudf_polars.typing import GenericTransformer, NodeT
 
@@ -23,22 +23,22 @@
 ]
 
 
-def traversal(node: NodeT) -> Generator[NodeT, None, None]:
+def traversal(nodes: Sequence[NodeT]) -> Generator[NodeT, None, None]:
     """
     Pre-order traversal of nodes in an expression.
 
     Parameters
     ----------
-    node
-        Root of expression to traverse.
+    nodes
+        Roots of expressions to traverse.
 
     Yields
     ------
-    Unique nodes in the expression, parent before child, children
+    Unique nodes in the expressions, parent before child, children
     in-order from left to right.
     """
-    seen = {node}
-    lifo = [node]
+    seen = set(nodes)
+    lifo = list(nodes)
 
     while lifo:
         node = lifo.pop()
diff --git a/python/cudf_polars/cudf_polars/experimental/io.py b/python/cudf_polars/cudf_polars/experimental/io.py
index 3a1fec36079..2a5b400af4c 100644
--- a/python/cudf_polars/cudf_polars/experimental/io.py
+++ b/python/cudf_polars/cudf_polars/experimental/io.py
@@ -4,18 +4,24 @@
 
 from __future__ import annotations
 
+import enum
 import math
-from typing import TYPE_CHECKING
+import random
+from enum import IntEnum
+from typing import TYPE_CHECKING, Any
 
-from cudf_polars.dsl.ir import DataFrameScan, Union
+import pylibcudf as plc
+
+from cudf_polars.dsl.ir import IR, DataFrameScan, Scan, Union
 from cudf_polars.experimental.base import PartitionInfo
 from cudf_polars.experimental.dispatch import lower_ir_node
 
 if TYPE_CHECKING:
     from collections.abc import MutableMapping
 
-    from cudf_polars.dsl.ir import IR
+    from cudf_polars.dsl.expr import NamedExpr
     from cudf_polars.experimental.dispatch import LowerIRTransformer
+    from cudf_polars.typing import Schema
 
 
 @lower_ir_node.register(DataFrameScan)
@@ -47,3 +53,274 @@ def _(
         }
 
     return ir, {ir: PartitionInfo(count=1)}
+
+
+class ScanPartitionFlavor(IntEnum):
+    """How the files of a Scan are mapped onto output partitions."""
+
+    SINGLE_FILE = enum.auto()  # 1:1 mapping between files and partitions
+    SPLIT_FILES = enum.auto()  # Split each file into >1 partition
+    FUSED_FILES = enum.auto()  # Fuse multiple files into each partition
+
+
+class ScanPartitionPlan:
+    """
+    Scan partitioning plan: a `flavor` paired with an integer `factor`.
+
+    Notes
+    -----
+    The meaning of `factor` depends on the value of `flavor`:
+      - SINGLE_FILE: `factor` must be `1`.
+      - SPLIT_FILES: `factor` is the number of partitions per file.
+      - FUSED_FILES: `factor` is the number of files per partition.
+    """
+
+    __slots__ = ("factor", "flavor")
+    factor: int
+    flavor: ScanPartitionFlavor
+
+    def __init__(self, factor: int, flavor: ScanPartitionFlavor) -> None:
+        if (
+            flavor == ScanPartitionFlavor.SINGLE_FILE and factor != 1
+        ):  # pragma: no cover
+            raise ValueError(f"Expected factor == 1 for {flavor}, got: {factor}")
+        self.factor = factor
+        self.flavor = flavor
+
+    @staticmethod
+    def from_scan(ir: Scan) -> ScanPartitionPlan:
+        """Build the plan for `ir`; only parquet is sized, the rest is SINGLE_FILE."""
+        if ir.typ == "parquet":
+            # TODO: Use system info to set default blocksize
+            parallel_options = ir.config_options.get("executor_options", {})
+            blocksize: int = parallel_options.get("parquet_blocksize", 1024**3)
+            stats = _sample_pq_statistics(ir)
+            file_size = sum(float(stats[column]) for column in ir.schema)
+            if file_size > 0:
+                if file_size > blocksize:
+                    # Sampled file size exceeds blocksize: split every file
+                    return ScanPartitionPlan(
+                        math.ceil(file_size / blocksize),
+                        ScanPartitionFlavor.SPLIT_FILES,
+                    )
+                else:
+                    # Files fit in a block: fuse as many as fit (at least one)
+                    return ScanPartitionPlan(
+                        max(blocksize // int(file_size), 1),
+                        ScanPartitionFlavor.FUSED_FILES,
+                    )
+
+        # TODO: Use file sizes for csv and json
+        return ScanPartitionPlan(1, ScanPartitionFlavor.SINGLE_FILE)
+
+
+class SplitScan(IR):
+    """
+    One of several partial reads of a single split file.
+
+    This class wraps a single-file `Scan` object. At
+    IO/evaluation time, this class will only perform
+    a partial read of the underlying file. The range
+    (skip_rows and n_rows) is calculated at IO time.
+    """
+
+    __slots__ = (
+        "base_scan",
+        "schema",
+        "split_index",
+        "total_splits",
+    )
+    _non_child = (
+        "schema",
+        "base_scan",
+        "split_index",
+        "total_splits",
+    )
+    base_scan: Scan
+    """Scan operation this node is based on."""
+    split_index: int
+    """Index of the current split."""
+    total_splits: int
+    """Total number of splits."""
+
+    def __init__(
+        self, schema: Schema, base_scan: Scan, split_index: int, total_splits: int
+    ):
+        self.schema = schema
+        self.base_scan = base_scan
+        self.split_index = split_index
+        self.total_splits = total_splits
+        self._non_child_args = (
+            split_index,
+            total_splits,
+            *base_scan._non_child_args,  # presumably the Scan.do_evaluate arguments
+        )
+        self.children = ()
+        if base_scan.typ not in ("parquet",):  # pragma: no cover
+            raise NotImplementedError(
+                f"Unhandled Scan type for file splitting: {base_scan.typ}"
+            )
+
+    @classmethod
+    def do_evaluate(
+        cls,
+        split_index: int,
+        total_splits: int,
+        schema: Schema,
+        typ: str,
+        reader_options: dict[str, Any],
+        config_options: dict[str, Any],
+        paths: list[str],
+        with_columns: list[str] | None,
+        skip_rows: int,
+        n_rows: int,
+        row_index: tuple[str, int] | None,
+        predicate: NamedExpr | None,
+    ):
+        """Read split `split_index` of `total_splits` from the single parquet path."""
+        if typ not in ("parquet",):  # pragma: no cover
+            raise NotImplementedError(f"Unhandled Scan type for file splitting: {typ}")
+
+        if len(paths) > 1:  # pragma: no cover
+            raise ValueError(f"Expected a single path, got: {paths}")
+
+        # Parquet logic:
+        # - We are one of "total_splits" SplitScan nodes
+        #   assigned to the same file.
+        # - We know our index within this file ("split_index")
+        # - We can also use parquet metadata to query the
+        #   total number of rows in each row-group of the file.
+        # - We can use all this information to calculate the
+        #   "skip_rows" and "n_rows" options to use locally
+        #   (the incoming values of both are overwritten below).
+        rowgroup_metadata = plc.io.parquet_metadata.read_parquet_metadata(
+            plc.io.SourceInfo(paths)
+        ).rowgroup_metadata()
+        total_row_groups = len(rowgroup_metadata)
+        if total_splits <= total_row_groups:
+            # We have enough row-groups in the file to align
+            # all "total_splits" of our reads with row-group
+            # boundaries. Calculate which row-groups to include
+            # in the current read, and use metadata to translate
+            # the row-group indices to "skip_rows" and "n_rows".
+            rg_stride = total_row_groups // total_splits
+            skip_rgs = rg_stride * split_index
+            skip_rows = sum(rg["num_rows"] for rg in rowgroup_metadata[:skip_rgs])
+            n_rows = sum(
+                rg["num_rows"]
+                for rg in rowgroup_metadata[skip_rgs : skip_rgs + rg_stride]
+            )
+        else:
+            # There are not enough row-groups to align
+            # all "total_splits" of our reads with row-group
+            # boundaries. Use metadata to directly calculate
+            # "skip_rows" and "n_rows" for the current read.
+            total_rows = sum(rg["num_rows"] for rg in rowgroup_metadata)
+            n_rows = total_rows // total_splits
+            skip_rows = n_rows * split_index
+
+        # Last split reads to end of file, covering any floor-division remainder
+        if split_index == (total_splits - 1):
+            n_rows = -1
+
+        # Perform the partial read
+        return Scan.do_evaluate(
+            schema,
+            typ,
+            reader_options,
+            config_options,
+            paths,
+            with_columns,
+            skip_rows,
+            n_rows,
+            row_index,
+            predicate,
+        )
+
+
+def _sample_pq_statistics(ir: Scan) -> dict[str, float]:
+    import numpy as np
+    import pyarrow.dataset as pa_ds
+
+    # Use average total_uncompressed_size of at most three sampled files
+    # TODO: Use plc.io.parquet_metadata.read_parquet_metadata
+    n_sample = min(3, len(ir.paths))  # random.sample rejects k > len(paths)
+    column_sizes = {}
+    ds = pa_ds.dataset(random.sample(ir.paths, n_sample), format="parquet")
+    for i, frag in enumerate(ds.get_fragments()):
+        md = frag.metadata
+        for rg in range(md.num_row_groups):
+            row_group = md.row_group(rg)
+            for col in range(row_group.num_columns):
+                column = row_group.column(col)
+                name = column.path_in_schema
+                if name not in column_sizes:
+                    column_sizes[name] = np.zeros(n_sample, dtype="int64")
+                column_sizes[name][i] += column.total_uncompressed_size
+
+    return {name: np.mean(sizes) for name, sizes in column_sizes.items()}
+
+
+@lower_ir_node.register(Scan)
+def _(
+    ir: Scan, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    """Lower a Scan to a Union of per-partition reads, else keep one partition."""
+    partition_info: MutableMapping[IR, PartitionInfo]
+    # Every partition gets the same row_index spec, so indexed scans stay unsplit
+    full_read = ir.n_rows == -1 and ir.skip_rows == 0 and ir.row_index is None
+    if ir.typ in ("csv", "parquet", "ndjson") and full_read:
+        plan = ScanPartitionPlan.from_scan(ir)
+        paths = list(ir.paths)
+        if plan.flavor == ScanPartitionFlavor.SPLIT_FILES:
+            # Disable chunked reader when splitting files
+            config_options = ir.config_options.copy()
+            parquet_options = config_options.get("parquet_options", {}).copy()
+            parquet_options["chunked"] = False
+            config_options["parquet_options"] = parquet_options
+
+            slices: list[SplitScan] = []
+            for path in paths:
+                base_scan = Scan(
+                    ir.schema,
+                    ir.typ,
+                    ir.reader_options,
+                    ir.cloud_options,
+                    config_options,
+                    [path],
+                    ir.with_columns,
+                    ir.skip_rows,
+                    ir.n_rows,
+                    ir.row_index,
+                    ir.predicate,
+                )
+                slices.extend(
+                    SplitScan(ir.schema, base_scan, sindex, plan.factor)
+                    for sindex in range(plan.factor)
+                )
+            new_node = Union(ir.schema, None, *slices)
+            partition_info = {node: PartitionInfo(count=1) for node in slices}
+            partition_info[new_node] = PartitionInfo(count=len(slices))
+        else:
+            groups: list[Scan] = [
+                Scan(
+                    ir.schema,
+                    ir.typ,
+                    ir.reader_options,
+                    ir.cloud_options,
+                    ir.config_options,
+                    paths[i : i + plan.factor],
+                    ir.with_columns,
+                    ir.skip_rows,
+                    ir.n_rows,
+                    ir.row_index,
+                    ir.predicate,
+                )
+                for i in range(0, len(paths), plan.factor)
+            ]
+            new_node = Union(ir.schema, None, *groups)
+            partition_info = {node: PartitionInfo(count=1) for node in groups}
+            partition_info[new_node] = PartitionInfo(count=len(groups))
+        return new_node, partition_info
+
+    return ir, {ir: PartitionInfo(count=1)}  # pragma: no cover
diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py
index 51850adfbeb..16290fdb663 100644
--- a/python/cudf_polars/cudf_polars/experimental/parallel.py
+++ b/python/cudf_polars/cudf_polars/experimental/parallel.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 """Multi-partition Dask execution."""
 
@@ -11,8 +11,9 @@
 
 import cudf_polars.experimental.io
 import cudf_polars.experimental.join
+import cudf_polars.experimental.select
 import cudf_polars.experimental.shuffle  # noqa: F401
-from cudf_polars.dsl.ir import IR, Cache, Projection, Union
+from cudf_polars.dsl.ir import IR, Cache, Filter, HStack, Projection, Select, Union
 from cudf_polars.dsl.traversal import CachingVisitor, traversal
 from cudf_polars.experimental.base import PartitionInfo, _concat, get_key_name
 from cudf_polars.experimental.dispatch import (
@@ -114,7 +115,7 @@ def task_graph(
     """
     graph = reduce(
         operator.or_,
-        (generate_ir_tasks(node, partition_info) for node in traversal(ir)),
+        (generate_ir_tasks(node, partition_info) for node in traversal([ir])),
     )
 
     key_name = get_key_name(ir)
@@ -228,6 +229,8 @@ def _lower_ir_pwise(
 
 lower_ir_node.register(Projection, _lower_ir_pwise)
 lower_ir_node.register(Cache, _lower_ir_pwise)
+lower_ir_node.register(Filter, _lower_ir_pwise)
+lower_ir_node.register(HStack, _lower_ir_pwise)
 
 
 def _generate_ir_tasks_pwise(
@@ -247,3 +250,6 @@ def _generate_ir_tasks_pwise(
 
 generate_ir_tasks.register(Projection, _generate_ir_tasks_pwise)
 generate_ir_tasks.register(Cache, _generate_ir_tasks_pwise)
+generate_ir_tasks.register(Filter, _generate_ir_tasks_pwise)
+generate_ir_tasks.register(HStack, _generate_ir_tasks_pwise)
+generate_ir_tasks.register(Select, _generate_ir_tasks_pwise)
diff --git a/python/cudf_polars/cudf_polars/experimental/select.py b/python/cudf_polars/cudf_polars/experimental/select.py
new file mode 100644
index 00000000000..5f79384b569
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/experimental/select.py
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+"""Parallel Select Logic."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from cudf_polars.dsl.ir import Select
+from cudf_polars.dsl.traversal import traversal
+from cudf_polars.experimental.dispatch import lower_ir_node
+
+if TYPE_CHECKING:
+    from collections.abc import MutableMapping
+
+    from cudf_polars.dsl.ir import IR
+    from cudf_polars.experimental.base import PartitionInfo
+    from cudf_polars.experimental.parallel import LowerIRTransformer
+
+
+@lower_ir_node.register(Select)
+def _(
+    ir: Select, rec: LowerIRTransformer
+) -> tuple[IR, MutableMapping[IR, PartitionInfo]]:
+    # Lower the child first; the rebuilt Select reuses the child's PartitionInfo.
+    child, partition_info = rec(ir.children[0])
+    child_info = partition_info[child]
+    roots = [named.value for named in ir.exprs]
+    if child_info.count > 1 and any(not e.is_pointwise for e in traversal(roots)):
+        # TODO: Handle non-pointwise expressions.
+        raise NotImplementedError(
+            f"Selection {ir} does not support multiple partitions."
+        )
+    lowered = ir.reconstruct([child])
+    partition_info[lowered] = child_info
+    return lowered, partition_info
diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py
index 7a759eea2e9..c16df320ceb 100644
--- a/python/cudf_polars/cudf_polars/testing/plugin.py
+++ b/python/cudf_polars/cudf_polars/testing/plugin.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 
 """Plugin for running polars test suite setting GPU engine as default."""
@@ -123,6 +123,11 @@ def pytest_configure(config: pytest.Config) -> None:
     "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR",
     "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR",
     "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR",
+    "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR",
+    "tests/unit/io/test_write.py::test_write_async[<lambda>-write_csv]": "Need to add include_file_path to IR",
+    "tests/unit/io/test_write.py::test_write_async[read_parquet-<lambda>]": "Need to add include_file_path to IR",
+    "tests/unit/io/test_write.py::test_write_async[<lambda>-<lambda>0]": "Need to add include_file_path to IR",
+    "tests/unit/io/test_write.py::test_write_async[<lambda>-<lambda>2]": "Need to add include_file_path to IR",
     "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[gpu]": "Expect this to pass because cudf-polars is installed",
     "tests/unit/lazyframe/test_engine_selection.py::test_engine_import_error_raises[engine1]": "Expect this to pass because cudf-polars is installed",
     "tests/unit/lazyframe/test_lazyframe.py::test_round[dtype1-123.55-1-123.6]": "Rounding midpoints is handled incorrectly",
@@ -140,6 +145,22 @@ def pytest_configure(config: pytest.Config) -> None:
     "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func1-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func2-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/arithmetic/test_list_arithmetic.py::test_list_arithmetic_values[func1-func3-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_left-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_right-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_both-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr-broadcast_none-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_array.py::test_array_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_left-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_right-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_both-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr-broadcast_none-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_left-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_right-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_both-none]": "cudf-polars doesn't nullify division by zero",
+    "tests/unit/operations/arithmetic/test_list.py::test_list_arithmetic_values[exec_op_with_expr_no_type_coercion-broadcast_none-none]": "cudf-polars doesn't nullify division by zero",
     "tests/unit/operations/test_abs.py::test_abs_duration": "Need to raise for unsupported uops on timelike values",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input7-expected7-Float32-Float32]": "Mismatching dtypes, needs cudf#15852",
     "tests/unit/operations/test_group_by.py::test_group_by_mean_by_dtype[input10-expected10-Date-output_dtype10]": "Unsupported groupby-agg for a particular dtype",
@@ -174,6 +195,19 @@ def pytest_configure(config: pytest.Config) -> None:
 }
 
 
+TESTS_TO_SKIP: Mapping[str, str] = {
+    # On Ubuntu 20.04, the tzdata package contains a bunch of symlinks
+    # for obsolete timezone names. However, the chrono_tz package that
+    # polars uses doesn't read /usr/share/zoneinfo, instead packaging
+    # the current zoneinfo database from IANA. Consequently, when this
+    # hypothesis-generated test runs and generates timezones from the
+    # available zoneinfo-reported timezones, we can get an error from
+    # polars that the requested timezone is unknown.
+    # Since this is random, just skip it, rather than xfailing.
+    "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names",
+}
+
+
 def pytest_collection_modifyitems(
     session: pytest.Session, config: pytest.Config, items: list[pytest.Item]
 ) -> None:
@@ -182,5 +216,7 @@ def pytest_collection_modifyitems(
         # Don't xfail tests if running without fallback
         return
     for item in items:
-        if item.nodeid in EXPECTED_FAILURES:
+        if item.nodeid in TESTS_TO_SKIP:
+            item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid]))
+        elif item.nodeid in EXPECTED_FAILURES:
             item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid]))
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index b781b13ec10..9fb9bbf391e 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 [build-system]
 build-backend = "rapids_build_backend.build"
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "polars>=1.11,<1.15",
+    "polars>=1.11,<1.18",
     "pylibcudf==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -66,8 +66,6 @@ addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
 filterwarnings = [
   "error",
-  # https://github.com/rapidsai/build-planning/issues/116
-  "ignore:.*cuda..* module is deprecated.*:DeprecationWarning",
 ]
 xfail_strict = true
 
diff --git a/python/cudf_polars/tests/dsl/test_traversal.py b/python/cudf_polars/tests/dsl/test_traversal.py
index 9755994c419..9fcca2e290e 100644
--- a/python/cudf_polars/tests/dsl/test_traversal.py
+++ b/python/cudf_polars/tests/dsl/test_traversal.py
@@ -32,21 +32,21 @@ def test_traversal_unique():
     dt = plc.DataType(plc.TypeId.INT8)
 
     e1 = make_expr(dt, "a", "a")
-    unique_exprs = list(traversal(e1))
+    unique_exprs = list(traversal([e1]))
 
     assert len(unique_exprs) == 2
     assert set(unique_exprs) == {expr.Col(dt, "a"), e1}
     assert unique_exprs == [e1, expr.Col(dt, "a")]
 
     e2 = make_expr(dt, "a", "b")
-    unique_exprs = list(traversal(e2))
+    unique_exprs = list(traversal([e2]))
 
     assert len(unique_exprs) == 3
     assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e2}
     assert unique_exprs == [e2, expr.Col(dt, "a"), expr.Col(dt, "b")]
 
     e3 = make_expr(dt, "b", "a")
-    unique_exprs = list(traversal(e3))
+    unique_exprs = list(traversal([e3]))
 
     assert len(unique_exprs) == 3
     assert set(unique_exprs) == {expr.Col(dt, "a"), expr.Col(dt, "b"), e3}
diff --git a/python/cudf_polars/tests/experimental/test_scan.py b/python/cudf_polars/tests/experimental/test_scan.py
new file mode 100644
index 00000000000..a26d751dc86
--- /dev/null
+++ b/python/cudf_polars/tests/experimental/test_scan.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars import Translator
+from cudf_polars.experimental.parallel import lower_ir_graph
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(scope="module")
+def df():
+    # 3_000 rows: an integer ramp plus a cycling string and a cycling float column.
+    columns = {
+        "x": range(3_000),
+        "y": ["cat", "dog", "fish"] * 1_000,
+        "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 600,
+    }
+    return pl.DataFrame(columns)
+
+
+def make_source(df, path, fmt, n_files=3):
+    """Write ``n_files`` consecutive slices of ``df`` into ``path`` as ``fmt`` files."""
+    rows_per_file = int(len(df) / n_files)
+    for idx in range(n_files):
+        chunk = df.slice(rows_per_file * idx, rows_per_file)
+        if fmt == "csv":
+            chunk.write_csv(path / f"part.{idx}.csv")
+        elif fmt == "ndjson":
+            chunk.write_ndjson(path / f"part.{idx}.ndjson")
+        else:
+            # Any other format is written as parquet, half a file per row group
+            chunk.write_parquet(
+                path / f"part.{idx}.parquet",
+                row_group_size=int(rows_per_file / 2),
+            )
+
+
+@pytest.mark.parametrize(
+    "fmt, scan_fn",
+    [
+        ("csv", pl.scan_csv),
+        ("ndjson", pl.scan_ndjson),
+        ("parquet", pl.scan_parquet),
+    ],
+)
+def test_parallel_scan(tmp_path, df, fmt, scan_fn):
+    gpu_engine = pl.GPUEngine(
+        raise_on_fail=True,
+        executor="dask-experimental",
+    )
+    make_source(df, tmp_path, fmt)
+    query = scan_fn(tmp_path)
+    assert_gpu_result_equal(query, engine=gpu_engine)
+
+
+@pytest.mark.parametrize("blocksize", [1_000, 10_000, 1_000_000])
+def test_parquet_blocksize(tmp_path, df, blocksize):
+    n_files = 3
+    make_source(df, tmp_path, "parquet", n_files)
+    query = pl.scan_parquet(tmp_path)
+    gpu_engine = pl.GPUEngine(
+        raise_on_fail=True,
+        executor="dask-experimental",
+        executor_options={"parquet_blocksize": blocksize},
+    )
+    assert_gpu_result_equal(query, engine=gpu_engine)
+
+    # Lower the translated plan and compare the root partition count to n_files
+    translated = Translator(query._ldf.visit(), gpu_engine).translate_ir()
+    root, info = lower_ir_graph(translated)
+    n_partitions = info[root].count
+    if blocksize > 12_000:
+        assert n_partitions < n_files
+    else:
+        assert n_partitions > n_files
diff --git a/python/cudf_polars/tests/experimental/test_select.py b/python/cudf_polars/tests/experimental/test_select.py
new file mode 100644
index 00000000000..7dfe6ead148
--- /dev/null
+++ b/python/cudf_polars/tests/experimental/test_select.py
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(scope="module")
+def engine():
+    """GPU engine for the "dask-experimental" executor with raise_on_fail set."""
+    options = {"max_rows_per_partition": 3}
+    return pl.GPUEngine(
+        raise_on_fail=True, executor="dask-experimental", executor_options=options
+    )
+
+
+@pytest.fixture(scope="module")
+def df():
+    """Seven-row LazyFrame: ``a`` counts 1..7 and ``b`` is constant 1."""
+    data = {
+        "a": [1, 2, 3, 4, 5, 6, 7],
+        "b": [1, 1, 1, 1, 1, 1, 1],
+    }
+    return pl.LazyFrame(data)
+
+
+def test_select(df, engine):
+    # Two element-wise arithmetic outputs built from both columns
+    a, b = pl.col("a"), pl.col("b")
+    selected = df.select(a + b, (a * 2 + b).alias("d"))
+    assert_gpu_result_equal(selected, engine=engine)
+
+
+def test_select_reduce_raises(df, engine):
+    # Selecting max/mean reductions must surface a NotImplementedError
+    a, b = pl.col("a"), pl.col("b")
+    reduced = df.select(
+        (a + b).max(),
+        (a * 2 + b).alias("d").mean(),
+    )
+    expected = pytest.raises(pl.exceptions.ComputeError, match="NotImplementedError")
+    with expected:
+        assert_gpu_result_equal(reduced, engine=engine)
+
+
+def test_select_with_cse_no_agg(df, engine):
+    twice = pl.col("a") + pl.col("a")  # sub-expression shared by all outputs
+    lf = df.select(twice, (twice * 2).alias("b"), ((twice * 2) + 10).alias("c"))
+    assert_gpu_result_equal(lf, engine=engine)
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index 86cb2352dcc..15ad845ea78 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
@@ -148,3 +148,9 @@ def test_agg_singleton(op):
     q = df.select(op(pl.col("a")))
 
     assert_gpu_result_equal(q)
+
+
+def test_sum_empty_zero():
+    empty = pl.Series(values=[], dtype=pl.Int32())
+    query = pl.LazyFrame({"a": empty}).select(pl.col("a").sum())
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index 1e8246496cd..53b96ba574b 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
@@ -213,3 +213,9 @@ def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
         )
     q = df.lazy().group_by(key_names, maintain_order=True).agg(pl.col("value").sum())
     assert_gpu_result_equal(q)
+
+
+def test_groupby_len_with_nulls():
+    lf = pl.DataFrame({"a": [1, 1, 1, 2], "b": [1, None, 2, 3]}).lazy()
+    counted = lf.group_by("a").agg(pl.col("b").len())
+    assert_gpu_result_equal(counted, check_row_order=False)
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 2fcbbf21f1c..f1f47bfb9f1 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
@@ -53,6 +53,15 @@ def right():
     )
 
 
+@pytest.mark.parametrize(
+    "maintain_order", ["left", "left_right", "right_left", "right"]
+)
+def test_join_maintain_order_param_unsupported(left, right, maintain_order):
+    # Each explicit maintain_order value must be rejected during IR translation
+    kwargs = {"on": pl.col("a"), "how": "inner", "maintain_order": maintain_order}
+    assert_ir_translation_raises(left.join(right, **kwargs), NotImplementedError)
+
+
 @pytest.mark.parametrize(
     "join_expr",
     [
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index ed43ab83d53..7820157d89b 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -93,8 +93,6 @@ addopts = "--tb=native --strict-config --strict-markers"
 empty_parameter_set_mark = "fail_at_collect"
 filterwarnings = [
     "error",
-    # https://github.com/rapidsai/build-planning/issues/116
-    "ignore:.*cuda..* module is deprecated.*:DeprecationWarning",
     "ignore:unclosed <socket.socket:ResourceWarning",
     "ignore:Port .* is already in use.:UserWarning:distributed",
     # Should be fixed in the next streamz release
diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py
index 20eb2404b77..863102103ed 100644
--- a/python/dask_cudf/dask_cudf/__init__.py
+++ b/python/dask_cudf/dask_cudf/__init__.py
@@ -1,7 +1,4 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-import warnings
-from importlib import import_module
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
 import dask.dataframe as dd
 from dask import config
@@ -9,11 +6,16 @@
 
 import cudf
 
-from . import backends  # noqa: F401
+from . import backends, io  # noqa: F401
+from ._expr.expr import _patch_dask_expr
 from ._version import __git_commit__, __version__  # noqa: F401
-from .core import DataFrame, Index, Series, concat, from_cudf
+from .core import DataFrame, Index, Series, _deprecated_api, concat, from_cudf
 
-QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED
+if not (QUERY_PLANNING_ON := dd._dask_expr_enabled()):
+    raise ValueError(
+        "The legacy DataFrame API is not supported in dask_cudf>24.12. "
+        "Please enable query-planning, or downgrade to dask_cudf<=24.12"
+    )
 
 
 def read_csv(*args, **kwargs):
@@ -36,46 +38,18 @@ def read_parquet(*args, **kwargs):
         return dd.read_parquet(*args, **kwargs)
 
 
-def _deprecated_api(old_api, new_api=None, rec=None):
-    def inner_func(*args, **kwargs):
-        if new_api:
-            # Use alternative
-            msg = f"{old_api} is now deprecated. "
-            msg += rec or f"Please use {new_api} instead."
-            warnings.warn(msg, FutureWarning)
-            new_attr = new_api.split(".")
-            module = import_module(".".join(new_attr[:-1]))
-            return getattr(module, new_attr[-1])(*args, **kwargs)
-
-        # No alternative - raise an error
-        raise NotImplementedError(
-            f"{old_api} is no longer supported. " + (rec or "")
-        )
-
-    return inner_func
-
-
-if QUERY_PLANNING_ON:
-    from . import io
-    from ._expr.expr import _patch_dask_expr
-
-    groupby_agg = _deprecated_api("dask_cudf.groupby_agg")
-    read_text = DataFrame.read_text
-    _patch_dask_expr()
-
-else:
-    from . import io  # noqa: F401
-    from ._legacy.groupby import groupby_agg  # noqa: F401
-    from ._legacy.io import read_text  # noqa: F401
-
-
+groupby_agg = _deprecated_api("dask_cudf.groupby_agg")
+read_text = DataFrame.read_text
 to_orc = _deprecated_api(
     "dask_cudf.to_orc",
-    new_api="dask_cudf._legacy.io.to_orc",
+    new_api="dask_cudf.io.to_orc",
     rec="Please use DataFrame.to_orc instead.",
 )
 
 
+_patch_dask_expr()
+
+
 __all__ = [
     "DataFrame",
     "Index",
diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py
index 2dc4031b876..e8c9a970b7b 100644
--- a/python/dask_cudf/dask_cudf/_expr/collection.py
+++ b/python/dask_cudf/dask_cudf/_expr/collection.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 import warnings
 from functools import cached_property
@@ -15,19 +15,11 @@
 
 from dask import config
 from dask.dataframe.core import is_dataframe_like
+from dask.dataframe.dispatch import get_parallel_type
 from dask.typing import no_default
 
 import cudf
 
-_LEGACY_WORKAROUND = (
-    "To enable the 'legacy' dask-cudf API, set the "
-    "global 'dataframe.query-planning' config to "
-    "`False` before dask is imported. This can also "
-    "be done by setting an environment variable: "
-    "`DASK_DATAFRAME__QUERY_PLANNING=False` "
-)
-
-
 ##
 ## Custom collection classes
 ##
@@ -103,9 +95,8 @@ def set_index(
             divisions = None
             warnings.warn(
                 "Ignoring divisions='quantile'. This option is now "
-                "deprecated. Please use the legacy API and raise an "
-                "issue on github if this feature is necessary."
-                f"\n{_LEGACY_WORKAROUND}",
+                "deprecated. Please raise an issue on github if this "
+                "feature is necessary.",
                 FutureWarning,
             )
 
@@ -135,9 +126,7 @@ def groupby(
 
             if kwargs.pop("as_index") is not True:
                 raise NotImplementedError(
-                    f"{msg} Please reset the index after aggregating, or "
-                    "use the legacy API if `as_index=False` is required.\n"
-                    f"{_LEGACY_WORKAROUND}"
+                    f"{msg} Please reset the index after aggregating."
                 )
             else:
                 warnings.warn(msg, FutureWarning)
@@ -153,15 +142,20 @@ def groupby(
         )
 
     def to_orc(self, *args, **kwargs):
-        from dask_cudf._legacy.io import to_orc
+        from dask_cudf.io.orc import to_orc as to_orc_impl
 
-        return to_orc(self, *args, **kwargs)
+        return to_orc_impl(self, *args, **kwargs)
 
     @staticmethod
     def read_text(*args, **kwargs):
-        from dask_cudf._legacy.io.text import read_text as legacy_read_text
+        from dask_cudf.io.text import read_text as read_text_impl
+
+        return read_text_impl(*args, **kwargs)
 
-        return legacy_read_text(*args, **kwargs)
+    def clip(self, lower=None, upper=None, axis=1):
+        if axis not in (None, 1):  # only axis=None or axis=1 is accepted
+            raise NotImplementedError("axis not yet supported in clip.")
+        return new_collection(self.expr.clip(lower, upper, 1))
 
 
 class Series(DXSeries, CudfFrameBase):
@@ -182,11 +176,23 @@ def struct(self):
 
         return StructMethods(self)
 
+    def clip(self, lower=None, upper=None, axis=1):
+        if axis not in (None, 1):  # only axis=None or axis=1 is accepted
+            raise NotImplementedError("axis not yet supported in clip.")
+        return new_collection(self.expr.clip(lower, upper, 1))
+
 
 class Index(DXIndex, CudfFrameBase):
     pass  # Same as pandas (for now)
 
 
+# dask.dataframe dispatch
+get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame)
+get_parallel_type.register(cudf.Series, lambda _: Series)
+get_parallel_type.register(cudf.BaseIndex, lambda _: Index)
+
+
+# dask_expr dispatch (might go away?)
 get_collection_type.register(cudf.DataFrame, lambda _: DataFrame)
 get_collection_type.register(cudf.Series, lambda _: Series)
 get_collection_type.register(cudf.BaseIndex, lambda _: Index)
diff --git a/python/dask_cudf/dask_cudf/_expr/expr.py b/python/dask_cudf/dask_cudf/_expr/expr.py
index 8b91e53604c..03d1da0d258 100644
--- a/python/dask_cudf/dask_cudf/_expr/expr.py
+++ b/python/dask_cudf/dask_cudf/_expr/expr.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 import functools
 
 import dask_expr._shuffle as _shuffle_module
@@ -7,13 +7,13 @@
 from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns
 from dask_expr._reductions import Reduction, Var
 
-from dask.dataframe.core import (
-    is_dataframe_like,
+from dask.dataframe.dispatch import (
+    is_categorical_dtype,
     make_meta,
     meta_nonempty,
 )
-from dask.dataframe.dispatch import is_categorical_dtype
 from dask.typing import no_default
+from dask.utils import is_dataframe_like
 
 import cudf
 
diff --git a/python/dask_cudf/dask_cudf/_expr/groupby.py b/python/dask_cudf/dask_cudf/_expr/groupby.py
index 0242fac6e72..a5cdd43169b 100644
--- a/python/dask_cudf/dask_cudf/_expr/groupby.py
+++ b/python/dask_cudf/dask_cudf/_expr/groupby.py
@@ -1,6 +1,7 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2025, NVIDIA CORPORATION.
 import functools
 
+import numpy as np
 import pandas as pd
 from dask_expr._collection import new_collection
 from dask_expr._groupby import (
@@ -16,11 +17,262 @@
 from dask.dataframe.groupby import Aggregation
 
 from cudf.core.groupby.groupby import _deprecate_collect
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 ##
 ## Fused groupby aggregations
 ##
 
+OPTIMIZED_AGGS = (
+    "count",
+    "mean",
+    "std",
+    "var",
+    "sum",
+    "min",
+    "max",
+    list,  # the builtin type; _redirect_aggs maps "list"/"collect" to it
+    "first",
+    "last",
+)
+
+
+def _make_name(col_name, sep="_"):
+    """Flatten `col_name` into one string by joining its non-empty
+    elements with `sep`; a plain string is returned unchanged.
+    """
+    if not isinstance(col_name, str):
+        return sep.join(part for part in col_name if part != "")
+    return col_name
+
+
+@_dask_cudf_performance_tracking
+def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep):
+    """Initial partition-level aggregation task.
+
+    This is the first operation executed on each input partition.
+    "mean", "std" and "var" requests are decomposed into "count" and
+    "sum" (plus the sum of a squared ``<col><sep>pow2`` column for
+    "std"/"var"); every other aggregation in `aggs` is applied as-is.
+    Output column labels are flattened with `_make_name` and returned
+    in sorted order.  `columns` is not used in this function.
+    NOTE(review): pow2 columns are assigned onto the input `df` itself.
+    """
+
+    # Modify dict for initial (partition-wise) aggregations
+    _agg_dict = {}
+    for col, agg_list in aggs.items():
+        _agg_dict[col] = set()
+        for agg in agg_list:
+            if agg in ("mean", "std", "var"):
+                _agg_dict[col].add("count")
+                _agg_dict[col].add("sum")
+            else:
+                _agg_dict[col].add(agg)
+        _agg_dict[col] = list(_agg_dict[col])
+        if set(agg_list).intersection({"std", "var"}):
+            pow2_name = _make_name((col, "pow2"), sep=sep)
+            df[pow2_name] = df[col].astype("float64").pow(2)
+            _agg_dict[pow2_name] = ["sum"]
+
+    gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg(
+        _agg_dict
+    )
+    output_columns = [_make_name(name, sep=sep) for name in gb.columns]
+    gb.columns = output_columns
+    # Return with deterministic column ordering
+    return gb[sorted(output_columns)]
+
+
+@_dask_cudf_performance_tracking
+def _tree_node_agg(df, gb_cols, dropna, sort, sep):
+    """Combine partial results at one node of the reduction tree.
+
+    `df` is the concatenated output of one or more
+    `_groupby_partition_agg` tasks.  Rows with duplicate keys are
+    merged by re-aggregating every non-key column according to the
+    aggregation suffix in its name: "count" and "sum" columns are
+    summed, "list" columns are collected again, and any other
+    optimized aggregation is simply re-applied.
+    """
+
+    def _combiner(col):
+        # The last `sep`-delimited token names the partial aggregation
+        suffix = col.split(sep)[-1]
+        if suffix in ("count", "sum"):
+            return ["sum"]
+        if suffix == "list":
+            return [list]
+        if suffix in OPTIMIZED_AGGS:
+            return [suffix]
+        raise ValueError(f"Unexpected aggregation: {suffix}")
+
+    agg_dict = {
+        col: _combiner(col) for col in df.columns if col not in gb_cols
+    }
+    grouped = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort)
+    gb = grouped.agg(agg_dict)
+
+    # Don't include the last aggregation in the column names
+    output_columns = []
+    for label in gb.columns:
+        stem = label[:-1] if isinstance(label, tuple) else label
+        output_columns.append(_make_name(stem, sep=sep))
+    gb.columns = output_columns
+    # Return with deterministic column ordering
+    return gb[sorted(output_columns)]
+
+
+@_dask_cudf_performance_tracking
+def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
+    """Compute variance from the count, sum and sum-of-squares columns."""
+
+    count = df[count_name]
+    total = df[sum_name]
+    total_sq = df[pow2_sum_name]
+
+    # Degrees of freedom; where this is exactly zero the variance is
+    # undefined and is overwritten with NaN at the end
+    dof = count - ddof
+    undefined = dof == 0
+
+    # Clamp the divisor to 1 so the division itself never hits zero
+    dof[dof < 1] = 1
+
+    # Sum-of-squares formulation: (sum(x^2) - sum(x)^2 / n) / (n - ddof)
+    variance = (total_sq - total**2 / count) / dof
+    variance[undefined] = np.nan
+    return variance
+
+
+@_dask_cudf_performance_tracking
+def _finalize_gb_agg(
+    gb_in,
+    gb_cols,
+    aggs,
+    columns,
+    final_columns,
+    as_index,
+    dropna,
+    sort,
+    sep,
+    str_cols_out,
+    aggs_renames,
+):
+    """Final aggregation task.
+
+    Runs on each output partition: combines the partials in `gb_in`
+    via `_tree_node_agg`, derives "mean"/"std"/"var" from the count,
+    sum and pow2-sum columns, concatenates collected lists, applies
+    sorting/indexing on `gb_cols`, and restores column labels (with
+    `aggs_renames`) before selecting `final_columns`.
+    """
+
+    gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep)
+
+    # Deal with higher-order aggregations
+    for col in columns:
+        agg_list = aggs.get(col, [])
+        agg_set = set(agg_list)
+        if agg_set.intersection({"mean", "std", "var"}):
+            count_name = _make_name((col, "count"), sep=sep)
+            sum_name = _make_name((col, "sum"), sep=sep)
+            if agg_set.intersection({"std", "var"}):
+                pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep)
+                var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name)
+                if "var" in agg_list:
+                    name_var = _make_name((col, "var"), sep=sep)
+                    gb[name_var] = var
+                if "std" in agg_list:
+                    name_std = _make_name((col, "std"), sep=sep)
+                    gb[name_std] = np.sqrt(var)
+                gb.drop(columns=[pow2_sum_name], inplace=True)
+            if "mean" in agg_list:
+                mean_name = _make_name((col, "mean"), sep=sep)
+                gb[mean_name] = gb[sum_name] / gb[count_name]
+            if "sum" not in agg_list:
+                gb.drop(columns=[sum_name], inplace=True)
+            if "count" not in agg_list:
+                gb.drop(columns=[count_name], inplace=True)
+        if list in agg_list:
+            collect_name = _make_name((col, "list"), sep=sep)
+            gb[collect_name] = gb[collect_name].list.concat()
+
+    # Ensure sorted keys if `sort=True`
+    if sort:
+        gb = gb.sort_values(gb_cols)
+
+    # Set index if necessary
+    if as_index:
+        gb.set_index(gb_cols, inplace=True)
+
+    # Unflatten column names
+    col_array = []
+    agg_array = []
+    for col in gb.columns:
+        if col in gb_cols:
+            col_array.append(col)
+            agg_array.append("")
+        else:
+            name, agg = col.rsplit(sep, 1)  # name itself may contain sep
+            col_array.append(name)
+            agg_array.append(aggs_renames.get((name, agg), agg))
+    if str_cols_out:
+        gb.columns = col_array
+    else:
+        gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array])
+
+    return gb[final_columns]
+
+
+@_dask_cudf_performance_tracking
+def _redirect_aggs(arg):
+    """Map aggregation aliases in `arg` to the names cuDF expects"""
+    redirects = {
+        sum: "sum",
+        max: "max",
+        min: "min",
+        "collect": list,
+        "list": list,
+    }
+
+    def _swap(agg):
+        # Aggregations without an alias entry pass through untouched
+        return redirects.get(agg, agg)
+
+    def _swap_spec(spec):
+        if isinstance(spec, list):
+            return [_swap(agg) for agg in spec]
+        if isinstance(spec, dict):
+            return {key: _swap(val) for key, val in spec.items()}
+        return _swap(spec)
+
+    if isinstance(arg, dict):
+        return {col: _swap_spec(arg[col]) for col in arg}
+    return _swap_spec(arg)
+
+
+@_dask_cudf_performance_tracking
+def _aggs_optimized(arg, supported: set):
+    """Return True only if every aggregation named in `arg` (a str or
+    type, a list, or a per-column dict) is contained in `supported`"""
+    if isinstance(arg, (str, type)):
+        return arg in supported
+    if isinstance(arg, list):
+        return set(arg).issubset(supported)
+    if not isinstance(arg, dict):
+        return False
+    # Dict spec: gather the aggregations requested across all columns
+    requested = set()
+    for spec in arg.values():
+        if isinstance(spec, list):
+            requested.update(spec)
+        elif isinstance(spec, dict):
+            requested.update(spec.values())
+        else:
+            requested.add(spec)
+    return requested.issubset(supported)
+
 
 def _get_spec_info(gb):
     if isinstance(gb.arg, (dict, list)):
@@ -105,20 +357,14 @@ def shuffle_by_index(self):
 
     @classmethod
     def chunk(cls, df, *by, **kwargs):
-        from dask_cudf._legacy.groupby import _groupby_partition_agg
-
         return _groupby_partition_agg(df, **kwargs)
 
     @classmethod
     def combine(cls, inputs, **kwargs):
-        from dask_cudf._legacy.groupby import _tree_node_agg
-
         return _tree_node_agg(_concat(inputs), **kwargs)
 
     @classmethod
     def aggregate(cls, inputs, **kwargs):
-        from dask_cudf._legacy.groupby import _finalize_gb_agg
-
         return _finalize_gb_agg(_concat(inputs), **kwargs)
 
     @property
@@ -193,12 +439,6 @@ def _maybe_get_custom_expr(
     shuffle_method=None,
     **kwargs,
 ):
-    from dask_cudf._legacy.groupby import (
-        OPTIMIZED_AGGS,
-        _aggs_optimized,
-        _redirect_aggs,
-    )
-
     if kwargs:
         # Unsupported key-word arguments
         return None
diff --git a/python/dask_cudf/dask_cudf/_legacy/core.py b/python/dask_cudf/dask_cudf/_legacy/core.py
deleted file mode 100644
index d6beb775a5e..00000000000
--- a/python/dask_cudf/dask_cudf/_legacy/core.py
+++ /dev/null
@@ -1,711 +0,0 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-import math
-import warnings
-
-import numpy as np
-import pandas as pd
-from tlz import partition_all
-
-from dask import dataframe as dd
-from dask.base import normalize_token, tokenize
-from dask.dataframe.core import (
-    Scalar,
-    handle_out,
-    make_meta as dask_make_meta,
-    map_partitions,
-)
-from dask.dataframe.utils import raise_on_meta_error
-from dask.highlevelgraph import HighLevelGraph
-from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname
-
-import cudf
-from cudf import _lib as libcudf
-from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
-
-from dask_cudf._expr.accessors import ListMethods, StructMethods
-from dask_cudf._legacy import sorting
-from dask_cudf._legacy.sorting import (
-    _deprecate_shuffle_kwarg,
-    _get_shuffle_method,
-)
-
-
-class _Frame(dd.core._Frame, OperatorMethodMixin):
-    """Superclass for DataFrame and Series
-
-    Parameters
-    ----------
-    dsk : dict
-        The dask graph to compute this DataFrame
-    name : str
-        The key prefix that specifies which keys in the dask comprise this
-        particular DataFrame / Series
-    meta : cudf.DataFrame, cudf.Series, or cudf.Index
-        An empty cudf object with names, dtypes, and indices matching the
-        expected output.
-    divisions : tuple of index values
-        Values along which we partition our blocks on the index
-    """
-
-    def _is_partition_type(self, meta):
-        return isinstance(meta, self._partition_type)
-
-    def __repr__(self):
-        s = "<dask_cudf.%s | %d tasks | %d npartitions>"
-        return s % (type(self).__name__, len(self.dask), self.npartitions)
-
-
-normalize_token.register(_Frame, lambda a: a._name)
-
-
-class DataFrame(_Frame, dd.core.DataFrame):
-    """
-    A distributed Dask DataFrame where the backing dataframe is a
-    :class:`cuDF DataFrame <cudf:cudf.DataFrame>`.
-
-    Typically you would not construct this object directly, but rather
-    use one of Dask-cuDF's IO routines.
-
-    Most operations on :doc:`Dask DataFrames <dask:dataframe>` are
-    supported, with many of the same caveats.
-
-    """
-
-    _partition_type = cudf.DataFrame
-
-    @_dask_cudf_performance_tracking
-    def _assign_column(self, k, v):
-        def assigner(df, k, v):
-            out = df.copy()
-            out[k] = v
-            return out
-
-        meta = assigner(self._meta, k, dask_make_meta(v))
-        return self.map_partitions(assigner, k, v, meta=meta)
-
-    @_dask_cudf_performance_tracking
-    def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None):
-        import uuid
-
-        if kwargs is None:
-            kwargs = {}
-
-        if cache_key is None:
-            cache_key = uuid.uuid4()
-
-        def do_apply_rows(df, func, incols, outcols, kwargs):
-            return df.apply_rows(
-                func, incols, outcols, kwargs, cache_key=cache_key
-            )
-
-        meta = do_apply_rows(self._meta, func, incols, outcols, kwargs)
-        return self.map_partitions(
-            do_apply_rows, func, incols, outcols, kwargs, meta=meta
-        )
-
-    @_deprecate_shuffle_kwarg
-    @_dask_cudf_performance_tracking
-    def merge(self, other, shuffle_method=None, **kwargs):
-        on = kwargs.pop("on", None)
-        if isinstance(on, tuple):
-            on = list(on)
-        return super().merge(
-            other,
-            on=on,
-            shuffle_method=_get_shuffle_method(shuffle_method),
-            **kwargs,
-        )
-
-    @_deprecate_shuffle_kwarg
-    @_dask_cudf_performance_tracking
-    def join(self, other, shuffle_method=None, **kwargs):
-        # CuDF doesn't support "right" join yet
-        how = kwargs.pop("how", "left")
-        if how == "right":
-            return other.join(other=self, how="left", **kwargs)
-
-        on = kwargs.pop("on", None)
-        if isinstance(on, tuple):
-            on = list(on)
-        return super().join(
-            other,
-            how=how,
-            on=on,
-            shuffle_method=_get_shuffle_method(shuffle_method),
-            **kwargs,
-        )
-
-    @_deprecate_shuffle_kwarg
-    @_dask_cudf_performance_tracking
-    def set_index(
-        self,
-        other,
-        sorted=False,
-        divisions=None,
-        shuffle_method=None,
-        **kwargs,
-    ):
-        pre_sorted = sorted
-        del sorted
-
-        if divisions == "quantile":
-            warnings.warn(
-                "Using divisions='quantile' is now deprecated. "
-                "Please raise an issue on github if you believe "
-                "this feature is necessary.",
-                FutureWarning,
-            )
-
-        if (
-            divisions == "quantile"
-            or isinstance(divisions, (cudf.DataFrame, cudf.Series))
-            or (
-                isinstance(other, str)
-                and cudf.api.types.is_string_dtype(self[other].dtype)
-            )
-        ):
-            # Let upstream-dask handle "pre-sorted" case
-            if pre_sorted:
-                return dd.shuffle.set_sorted_index(
-                    self, other, divisions=divisions, **kwargs
-                )
-
-            by = other
-            if not isinstance(other, list):
-                by = [by]
-            if len(by) > 1:
-                raise ValueError("Dask does not support MultiIndex (yet).")
-            if divisions == "quantile":
-                divisions = None
-
-            # Use dask_cudf's sort_values
-            df = self.sort_values(
-                by,
-                max_branch=kwargs.get("max_branch", None),
-                divisions=divisions,
-                set_divisions=True,
-                ignore_index=True,
-                shuffle_method=shuffle_method,
-            )
-
-            # Ignore divisions if its a dataframe
-            if isinstance(divisions, cudf.DataFrame):
-                divisions = None
-
-            # Set index and repartition
-            df2 = df.map_partitions(
-                sorting.set_index_post,
-                index_name=other,
-                drop=kwargs.get("drop", True),
-                column_dtype=df.columns.dtype,
-            )
-            npartitions = kwargs.get("npartitions", self.npartitions)
-            partition_size = kwargs.get("partition_size", None)
-            if partition_size:
-                return df2.repartition(partition_size=partition_size)
-            if not divisions and df2.npartitions != npartitions:
-                return df2.repartition(npartitions=npartitions)
-            if divisions and df2.npartitions != len(divisions) - 1:
-                return df2.repartition(divisions=divisions)
-            return df2
-
-        return super().set_index(
-            other,
-            sorted=pre_sorted,
-            shuffle_method=_get_shuffle_method(shuffle_method),
-            divisions=divisions,
-            **kwargs,
-        )
-
-    @_deprecate_shuffle_kwarg
-    @_dask_cudf_performance_tracking
-    def sort_values(
-        self,
-        by,
-        ignore_index=False,
-        max_branch=None,
-        divisions=None,
-        set_divisions=False,
-        ascending=True,
-        na_position="last",
-        sort_function=None,
-        sort_function_kwargs=None,
-        shuffle_method=None,
-        **kwargs,
-    ):
-        if kwargs:
-            raise ValueError(
-                f"Unsupported input arguments passed : {list(kwargs.keys())}"
-            )
-
-        df = sorting.sort_values(
-            self,
-            by,
-            max_branch=max_branch,
-            divisions=divisions,
-            set_divisions=set_divisions,
-            ignore_index=ignore_index,
-            ascending=ascending,
-            na_position=na_position,
-            shuffle_method=shuffle_method,
-            sort_function=sort_function,
-            sort_function_kwargs=sort_function_kwargs,
-        )
-
-        if ignore_index:
-            return df.reset_index(drop=True)
-        return df
-
-    @_dask_cudf_performance_tracking
-    def to_parquet(self, path, *args, **kwargs):
-        """Calls dask.dataframe.io.to_parquet with CudfEngine backend"""
-        from dask_cudf._legacy.io import to_parquet
-
-        return to_parquet(self, path, *args, **kwargs)
-
-    @_dask_cudf_performance_tracking
-    def to_orc(self, path, **kwargs):
-        """Calls dask_cudf._legacy.io.to_orc"""
-        from dask_cudf._legacy.io import to_orc
-
-        return to_orc(self, path, **kwargs)
-
-    @derived_from(pd.DataFrame)
-    @_dask_cudf_performance_tracking
-    def var(
-        self,
-        axis=None,
-        skipna=True,
-        ddof=1,
-        split_every=False,
-        dtype=None,
-        out=None,
-        naive=False,
-        numeric_only=False,
-    ):
-        axis = self._validate_axis(axis)
-        meta = self._meta_nonempty.var(
-            axis=axis, skipna=skipna, numeric_only=numeric_only
-        )
-        if axis == 1:
-            result = map_partitions(
-                M.var,
-                self,
-                meta=meta,
-                token=self._token_prefix + "var",
-                axis=axis,
-                skipna=skipna,
-                ddof=ddof,
-                numeric_only=numeric_only,
-            )
-            return handle_out(out, result)
-        elif naive:
-            return _naive_var(self, meta, skipna, ddof, split_every, out)
-        else:
-            return _parallel_var(self, meta, skipna, split_every, out)
-
-    @_deprecate_shuffle_kwarg
-    @_dask_cudf_performance_tracking
-    def shuffle(self, *args, shuffle_method=None, **kwargs):
-        """Wraps dask.dataframe DataFrame.shuffle method"""
-        return super().shuffle(
-            *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs
-        )
-
-    @_dask_cudf_performance_tracking
-    def groupby(self, by=None, **kwargs):
-        from .groupby import CudfDataFrameGroupBy
-
-        return CudfDataFrameGroupBy(self, by=by, **kwargs)
-
-
-@_dask_cudf_performance_tracking
-def sum_of_squares(x):
-    x = x.astype("f8")._column
-    outcol = libcudf.reduce.reduce("sum_of_squares", x)
-    return cudf.Series._from_column(outcol)
-
-
-@_dask_cudf_performance_tracking
-def var_aggregate(x2, x, n, ddof):
-    try:
-        with warnings.catch_warnings(record=True):
-            warnings.simplefilter("always")
-            result = (x2 / n) - (x / n) ** 2
-        if ddof != 0:
-            result = result * n / (n - ddof)
-        return result
-    except ZeroDivisionError:
-        return np.float64(np.nan)
-
-
-@_dask_cudf_performance_tracking
-def nlargest_agg(x, **kwargs):
-    return cudf.concat(x).nlargest(**kwargs)
-
-
-@_dask_cudf_performance_tracking
-def nsmallest_agg(x, **kwargs):
-    return cudf.concat(x).nsmallest(**kwargs)
-
-
-class Series(_Frame, dd.core.Series):
-    _partition_type = cudf.Series
-
-    @_dask_cudf_performance_tracking
-    def count(self, split_every=False):
-        return reduction(
-            [self],
-            chunk=M.count,
-            aggregate=np.sum,
-            split_every=split_every,
-            meta="i8",
-        )
-
-    @_dask_cudf_performance_tracking
-    def mean(self, split_every=False):
-        sum = self.sum(split_every=split_every)
-        n = self.count(split_every=split_every)
-        return sum / n
-
-    @derived_from(pd.DataFrame)
-    @_dask_cudf_performance_tracking
-    def var(
-        self,
-        axis=None,
-        skipna=True,
-        ddof=1,
-        split_every=False,
-        dtype=None,
-        out=None,
-        naive=False,
-    ):
-        axis = self._validate_axis(axis)
-        meta = self._meta_nonempty.var(axis=axis, skipna=skipna)
-        if axis == 1:
-            result = map_partitions(
-                M.var,
-                self,
-                meta=meta,
-                token=self._token_prefix + "var",
-                axis=axis,
-                skipna=skipna,
-                ddof=ddof,
-            )
-            return handle_out(out, result)
-        elif naive:
-            return _naive_var(self, meta, skipna, ddof, split_every, out)
-        else:
-            return _parallel_var(self, meta, skipna, split_every, out)
-
-    @_dask_cudf_performance_tracking
-    def groupby(self, *args, **kwargs):
-        from .groupby import CudfSeriesGroupBy
-
-        return CudfSeriesGroupBy(self, *args, **kwargs)
-
-    @property  # type: ignore
-    @_dask_cudf_performance_tracking
-    def list(self):
-        return ListMethods(self)
-
-    @property  # type: ignore
-    @_dask_cudf_performance_tracking
-    def struct(self):
-        return StructMethods(self)
-
-
-class Index(Series, dd.core.Index):
-    _partition_type = cudf.Index  # type: ignore
-
-
-@_dask_cudf_performance_tracking
-def _naive_var(ddf, meta, skipna, ddof, split_every, out):
-    num = ddf._get_numeric_data()
-    x = 1.0 * num.sum(skipna=skipna, split_every=split_every)
-    x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every)
-    n = num.count(split_every=split_every)
-    name = ddf._token_prefix + "var"
-    result = map_partitions(
-        var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof
-    )
-    if isinstance(ddf, DataFrame):
-        result.divisions = (min(ddf.columns), max(ddf.columns))
-    return handle_out(out, result)
-
-
-@_dask_cudf_performance_tracking
-def _parallel_var(ddf, meta, skipna, split_every, out):
-    def _local_var(x, skipna):
-        if skipna:
-            n = x.count()
-            avg = x.mean(skipna=skipna)
-        else:
-            # Not skipping nulls, so might as well
-            # avoid the full `count` operation
-            n = len(x)
-            avg = x.sum(skipna=skipna) / n
-        m2 = ((x - avg) ** 2).sum(skipna=skipna)
-        return n, avg, m2
-
-    def _aggregate_var(parts):
-        n, avg, m2 = parts[0]
-        for i in range(1, len(parts)):
-            n_a, avg_a, m2_a = n, avg, m2
-            n_b, avg_b, m2_b = parts[i]
-            n = n_a + n_b
-            avg = (n_a * avg_a + n_b * avg_b) / n
-            delta = avg_b - avg_a
-            m2 = m2_a + m2_b + delta**2 * n_a * n_b / n
-        return n, avg, m2
-
-    def _finalize_var(vals):
-        n, _, m2 = vals
-        return m2 / (n - 1)
-
-    # Build graph
-    nparts = ddf.npartitions
-    if not split_every:
-        split_every = nparts
-    name = "var-" + tokenize(skipna, split_every, out)
-    local_name = "local-" + name
-    num = ddf._get_numeric_data()
-    dsk = {
-        (local_name, n, 0): (_local_var, (num._name, n), skipna)
-        for n in range(nparts)
-    }
-
-    # Use reduction tree
-    widths = [nparts]
-    while nparts > 1:
-        nparts = math.ceil(nparts / split_every)
-        widths.append(nparts)
-    height = len(widths)
-    for depth in range(1, height):
-        for group in range(widths[depth]):
-            p_max = widths[depth - 1]
-            lstart = split_every * group
-            lstop = min(lstart + split_every, p_max)
-            node_list = [
-                (local_name, p, depth - 1) for p in range(lstart, lstop)
-            ]
-            dsk[(local_name, group, depth)] = (_aggregate_var, node_list)
-    if height == 1:
-        group = depth = 0
-    dsk[(name, 0)] = (_finalize_var, (local_name, group, depth))
-
-    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf])
-    result = dd.core.new_dd_object(graph, name, meta, (None, None))
-    if isinstance(ddf, DataFrame):
-        result.divisions = (min(ddf.columns), max(ddf.columns))
-    return handle_out(out, result)
-
-
-@_dask_cudf_performance_tracking
-def _extract_meta(x):
-    """
-    Extract internal cache data (``_meta``) from dask_cudf objects
-    """
-    if isinstance(x, (Scalar, _Frame)):
-        return x._meta
-    elif isinstance(x, list):
-        return [_extract_meta(_x) for _x in x]
-    elif isinstance(x, tuple):
-        return tuple(_extract_meta(_x) for _x in x)
-    elif isinstance(x, dict):
-        return {k: _extract_meta(v) for k, v in x.items()}
-    return x
-
-
-@_dask_cudf_performance_tracking
-def _emulate(func, *args, **kwargs):
-    """
-    Apply a function using args / kwargs. If arguments contain dd.DataFrame /
-    dd.Series, using internal cache (``_meta``) for calculation
-    """
-    with raise_on_meta_error(funcname(func)):
-        return func(*_extract_meta(args), **_extract_meta(kwargs))
-
-
-@_dask_cudf_performance_tracking
-def align_partitions(args):
-    """Align partitions between dask_cudf objects.
-
-    Note that if all divisions are unknown, but have equal npartitions, then
-    they will be passed through unchanged.
-    """
-    dfs = [df for df in args if isinstance(df, _Frame)]
-    if not dfs:
-        return args
-
-    divisions = dfs[0].divisions
-    if not all(df.divisions == divisions for df in dfs):
-        raise NotImplementedError("Aligning mismatched partitions")
-    return args
-
-
-@_dask_cudf_performance_tracking
-def reduction(
-    args,
-    chunk=None,
-    aggregate=None,
-    combine=None,
-    meta=None,
-    token=None,
-    chunk_kwargs=None,
-    aggregate_kwargs=None,
-    combine_kwargs=None,
-    split_every=None,
-    **kwargs,
-):
-    """Generic tree reduction operation.
-
-    Parameters
-    ----------
-    args :
-        Positional arguments for the `chunk` function. All `dask.dataframe`
-        objects should be partitioned and indexed equivalently.
-    chunk : function [block-per-arg] -> block
-        Function to operate on each block of data
-    aggregate : function list-of-blocks -> block
-        Function to operate on the list of results of chunk
-    combine : function list-of-blocks -> block, optional
-        Function to operate on intermediate lists of results of chunk
-        in a tree-reduction. If not provided, defaults to aggregate.
-    $META
-    token : str, optional
-        The name to use for the output keys.
-    chunk_kwargs : dict, optional
-        Keywords for the chunk function only.
-    aggregate_kwargs : dict, optional
-        Keywords for the aggregate function only.
-    combine_kwargs : dict, optional
-        Keywords for the combine function only.
-    split_every : int, optional
-        Group partitions into groups of this size while performing a
-        tree-reduction. If set to False, no tree-reduction will be used,
-        and all intermediates will be concatenated and passed to ``aggregate``.
-        Default is 8.
-    kwargs :
-        All remaining keywords will be passed to ``chunk``, ``aggregate``, and
-        ``combine``.
-    """
-    if chunk_kwargs is None:
-        chunk_kwargs = dict()
-    if aggregate_kwargs is None:
-        aggregate_kwargs = dict()
-    chunk_kwargs.update(kwargs)
-    aggregate_kwargs.update(kwargs)
-
-    if combine is None:
-        if combine_kwargs:
-            raise ValueError("`combine_kwargs` provided with no `combine`")
-        combine = aggregate
-        combine_kwargs = aggregate_kwargs
-    else:
-        if combine_kwargs is None:
-            combine_kwargs = dict()
-        combine_kwargs.update(kwargs)
-
-    if not isinstance(args, (tuple, list)):
-        args = [args]
-
-    npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)}
-    if len(npartitions) > 1:
-        raise ValueError("All arguments must have same number of partitions")
-    npartitions = npartitions.pop()
-
-    if split_every is None:
-        split_every = 8
-    elif split_every is False:
-        split_every = npartitions
-    elif split_every < 2 or not isinstance(split_every, int):
-        raise ValueError("split_every must be an integer >= 2")
-
-    token_key = tokenize(
-        token or (chunk, aggregate),
-        meta,
-        args,
-        chunk_kwargs,
-        aggregate_kwargs,
-        combine_kwargs,
-        split_every,
-    )
-
-    # Chunk
-    a = f"{token or funcname(chunk)}-chunk-{token_key}"
-    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
-        dsk = {
-            (a, 0, i): (chunk, key)
-            for i, key in enumerate(args[0].__dask_keys__())
-        }
-    else:
-        dsk = {
-            (a, 0, i): (
-                apply,
-                chunk,
-                [(x._name, i) if isinstance(x, _Frame) else x for x in args],
-                chunk_kwargs,
-            )
-            for i in range(args[0].npartitions)
-        }
-
-    # Combine
-    b = f"{token or funcname(combine)}-combine-{token_key}"
-    k = npartitions
-    depth = 0
-    while k > split_every:
-        for part_i, inds in enumerate(partition_all(split_every, range(k))):
-            conc = (list, [(a, depth, i) for i in inds])
-            dsk[(b, depth + 1, part_i)] = (
-                (apply, combine, [conc], combine_kwargs)
-                if combine_kwargs
-                else (combine, conc)
-            )
-        k = part_i + 1
-        a = b
-        depth += 1
-
-    # Aggregate
-    b = f"{token or funcname(aggregate)}-agg-{token_key}"
-    conc = (list, [(a, depth, i) for i in range(k)])
-    if aggregate_kwargs:
-        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
-    else:
-        dsk[(b, 0)] = (aggregate, conc)
-
-    if meta is None:
-        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
-        meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs)
-    meta = dask_make_meta(meta)
-
-    graph = HighLevelGraph.from_collections(b, dsk, dependencies=args)
-    return dd.core.new_dd_object(graph, b, meta, (None, None))
-
-
-for name in (
-    "add",
-    "sub",
-    "mul",
-    "truediv",
-    "floordiv",
-    "mod",
-    "pow",
-    "radd",
-    "rsub",
-    "rmul",
-    "rtruediv",
-    "rfloordiv",
-    "rmod",
-    "rpow",
-):
-    meth = getattr(cudf.DataFrame, name)
-    DataFrame._bind_operator_method(name, meth, original=cudf.Series)
-
-    meth = getattr(cudf.Series, name)
-    Series._bind_operator_method(name, meth, original=cudf.Series)
-
-for name in ("lt", "gt", "le", "ge", "ne", "eq"):
-    meth = getattr(cudf.Series, name)
-    Series._bind_comparison_method(name, meth, original=cudf.Series)
diff --git a/python/dask_cudf/dask_cudf/_legacy/groupby.py b/python/dask_cudf/dask_cudf/_legacy/groupby.py
deleted file mode 100644
index 7e01e91476d..00000000000
--- a/python/dask_cudf/dask_cudf/_legacy/groupby.py
+++ /dev/null
@@ -1,909 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-from __future__ import annotations
-
-from functools import wraps
-
-import numpy as np
-import pandas as pd
-
-from dask.dataframe.core import (
-    DataFrame as DaskDataFrame,
-    aca,
-    split_out_on_cols,
-)
-from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy
-from dask.utils import funcname
-
-import cudf
-from cudf.core.groupby.groupby import _deprecate_collect
-from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
-
-from dask_cudf._legacy.sorting import _deprecate_shuffle_kwarg
-
-# aggregations that are dask-cudf optimized
-OPTIMIZED_AGGS = (
-    "count",
-    "mean",
-    "std",
-    "var",
-    "sum",
-    "min",
-    "max",
-    list,
-    "first",
-    "last",
-)
-
-
-def _check_groupby_optimized(func):
-    """
-    Decorator for dask-cudf's groupby methods that returns the dask-cudf
-    optimized method if the groupby object is supported, otherwise
-    reverting to the upstream Dask method
-    """
-
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        gb = args[0]
-        if _groupby_optimized(gb):
-            return func(*args, **kwargs)
-        # note that we use upstream Dask's default kwargs for this call if
-        # none are specified; this shouldn't be an issue as those defaults are
-        # consistent with dask-cudf
-        return getattr(super(type(gb), gb), func.__name__)(*args[1:], **kwargs)
-
-    return wrapper
-
-
-class CudfDataFrameGroupBy(DataFrameGroupBy):
-    @_dask_cudf_performance_tracking
-    def __init__(self, *args, sort=None, **kwargs):
-        self.sep = kwargs.pop("sep", "___")
-        self.as_index = kwargs.pop("as_index", True)
-        super().__init__(*args, sort=sort, **kwargs)
-
-    @_dask_cudf_performance_tracking
-    def __getitem__(self, key):
-        if isinstance(key, list):
-            g = CudfDataFrameGroupBy(
-                self.obj,
-                by=self.by,
-                slice=key,
-                sort=self.sort,
-                **self.dropna,
-            )
-        else:
-            g = CudfSeriesGroupBy(
-                self.obj,
-                by=self.by,
-                slice=key,
-                sort=self.sort,
-                **self.dropna,
-            )
-
-        g._meta = g._meta[key]
-        return g
-
-    @_dask_cudf_performance_tracking
-    def _make_groupby_method_aggs(self, agg_name):
-        """Create aggs dictionary for aggregation methods"""
-
-        if isinstance(self.by, list):
-            return {c: agg_name for c in self.obj.columns if c not in self.by}
-        return {c: agg_name for c in self.obj.columns if c != self.by}
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def count(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            self._make_groupby_method_aggs("count"),
-            split_every,
-            split_out,
-        )
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def mean(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            self._make_groupby_method_aggs("mean"),
-            split_every,
-            split_out,
-        )
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def std(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            self._make_groupby_method_aggs("std"),
-            split_every,
-            split_out,
-        )
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def var(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            self._make_groupby_method_aggs("var"),
-            split_every,
-            split_out,
-        )
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def sum(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            self._make_groupby_method_aggs("sum"),
-            split_every,
-            split_out,
-        )
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def min(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            self._make_groupby_method_aggs("min"),
-            split_every,
-            split_out,
-        )
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def max(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            self._make_groupby_method_aggs("max"),
-            split_every,
-            split_out,
-        )
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def collect(self, split_every=None, split_out=1):
-        _deprecate_collect()
-        return _make_groupby_agg_call(
-            self,
-            self._make_groupby_method_aggs(list),
-            split_every,
-            split_out,
-        )
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def first(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            self._make_groupby_method_aggs("first"),
-            split_every,
-            split_out,
-        )
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def last(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            self._make_groupby_method_aggs("last"),
-            split_every,
-            split_out,
-        )
-
-    @_deprecate_shuffle_kwarg
-    @_dask_cudf_performance_tracking
-    def aggregate(
-        self, arg, split_every=None, split_out=1, shuffle_method=None
-    ):
-        if arg == "size":
-            return self.size()
-
-        arg = _redirect_aggs(arg)
-
-        if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS):
-            if isinstance(self._meta.grouping.keys, cudf.MultiIndex):
-                keys = self._meta.grouping.keys.names
-            else:
-                keys = self._meta.grouping.keys.name
-
-            return groupby_agg(
-                self.obj,
-                keys,
-                arg,
-                split_every=split_every,
-                split_out=split_out,
-                sep=self.sep,
-                sort=self.sort,
-                as_index=self.as_index,
-                shuffle_method=shuffle_method,
-                **self.dropna,
-            )
-
-        return super().aggregate(
-            arg,
-            split_every=split_every,
-            split_out=split_out,
-            shuffle_method=shuffle_method,
-        )
-
-
-class CudfSeriesGroupBy(SeriesGroupBy):
-    @_dask_cudf_performance_tracking
-    def __init__(self, *args, sort=None, **kwargs):
-        self.sep = kwargs.pop("sep", "___")
-        self.as_index = kwargs.pop("as_index", True)
-        super().__init__(*args, sort=sort, **kwargs)
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def count(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            {self._slice: "count"},
-            split_every,
-            split_out,
-        )[self._slice]
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def mean(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            {self._slice: "mean"},
-            split_every,
-            split_out,
-        )[self._slice]
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def std(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            {self._slice: "std"},
-            split_every,
-            split_out,
-        )[self._slice]
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def var(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            {self._slice: "var"},
-            split_every,
-            split_out,
-        )[self._slice]
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def sum(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            {self._slice: "sum"},
-            split_every,
-            split_out,
-        )[self._slice]
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def min(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            {self._slice: "min"},
-            split_every,
-            split_out,
-        )[self._slice]
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def max(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            {self._slice: "max"},
-            split_every,
-            split_out,
-        )[self._slice]
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def collect(self, split_every=None, split_out=1):
-        _deprecate_collect()
-        return _make_groupby_agg_call(
-            self,
-            {self._slice: list},
-            split_every,
-            split_out,
-        )[self._slice]
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def first(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            {self._slice: "first"},
-            split_every,
-            split_out,
-        )[self._slice]
-
-    @_dask_cudf_performance_tracking
-    @_check_groupby_optimized
-    def last(self, split_every=None, split_out=1):
-        return _make_groupby_agg_call(
-            self,
-            {self._slice: "last"},
-            split_every,
-            split_out,
-        )[self._slice]
-
-    @_deprecate_shuffle_kwarg
-    @_dask_cudf_performance_tracking
-    def aggregate(
-        self, arg, split_every=None, split_out=1, shuffle_method=None
-    ):
-        if arg == "size":
-            return self.size()
-
-        arg = _redirect_aggs(arg)
-
-        if not isinstance(arg, dict):
-            arg = {self._slice: arg}
-
-        if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS):
-            return _make_groupby_agg_call(
-                self, arg, split_every, split_out, shuffle_method
-            )[self._slice]
-
-        return super().aggregate(
-            arg,
-            split_every=split_every,
-            split_out=split_out,
-            shuffle_method=shuffle_method,
-        )
-
-
-def _shuffle_aggregate(
-    ddf,
-    gb_cols,
-    chunk,
-    chunk_kwargs,
-    aggregate,
-    aggregate_kwargs,
-    split_every,
-    split_out,
-    token=None,
-    sort=None,
-    shuffle_method=None,
-):
-    # Shuffle-based groupby aggregation
-    # NOTE: This function is the dask_cudf version of
-    # dask.dataframe.groupby._shuffle_aggregate
-
-    # Step 1 - Chunkwise groupby operation
-    chunk_name = f"{token or funcname(chunk)}-chunk"
-    chunked = ddf.map_partitions(
-        chunk,
-        meta=chunk(ddf._meta, **chunk_kwargs),
-        token=chunk_name,
-        **chunk_kwargs,
-    )
-
-    # Step 2 - Perform global sort or shuffle
-    shuffle_npartitions = max(
-        chunked.npartitions // split_every,
-        split_out,
-    )
-    if sort and split_out > 1:
-        # Sort-based code path
-        result = (
-            chunked.repartition(npartitions=shuffle_npartitions)
-            .sort_values(
-                gb_cols,
-                ignore_index=True,
-                shuffle_method=shuffle_method,
-            )
-            .map_partitions(
-                aggregate,
-                meta=aggregate(chunked._meta, **aggregate_kwargs),
-                **aggregate_kwargs,
-            )
-        )
-    else:
-        # Hash-based code path
-        result = chunked.shuffle(
-            gb_cols,
-            npartitions=shuffle_npartitions,
-            ignore_index=True,
-            shuffle_method=shuffle_method,
-        ).map_partitions(
-            aggregate,
-            meta=aggregate(chunked._meta, **aggregate_kwargs),
-            **aggregate_kwargs,
-        )
-
-    # Step 3 - Repartition and return
-    if split_out < result.npartitions:
-        return result.repartition(npartitions=split_out)
-    return result
-
-
-@_dask_cudf_performance_tracking
-def groupby_agg(
-    ddf,
-    gb_cols,
-    aggs_in,
-    split_every=None,
-    split_out=None,
-    dropna=True,
-    sep="___",
-    sort=False,
-    as_index=True,
-    shuffle_method=None,
-):
-    """Optimized groupby aggregation for Dask-CuDF.
-
-    Parameters
-    ----------
-    ddf : DataFrame
-        DataFrame object to perform grouping on.
-    gb_cols : str or list[str]
-        Column names to group by.
-    aggs_in : str, list, or dict
-        Aggregations to perform.
-    split_every : int (optional)
-        How to group intermediate aggregates.
-    dropna : bool
-        Drop grouping key values corresponding to NA values.
-    as_index : bool
-        Currently ignored.
-    sort : bool
-        Sort the group keys, better performance is obtained when
-        not sorting.
-    shuffle_method : str (optional)
-        Control how shuffling of the DataFrame is performed.
-    sep : str
-        Internal usage.
-
-
-    Notes
-    -----
-    This "optimized" approach is more performant than the algorithm in
-    implemented in :meth:`DataFrame.apply` because it allows the cuDF
-    backend to perform multiple aggregations at once.
-
-    This aggregation algorithm only supports the following options
-
-    * "list"
-    * "count"
-    * "first"
-    * "last"
-    * "max"
-    * "mean"
-    * "min"
-    * "std"
-    * "sum"
-    * "var"
-
-
-    See Also
-    --------
-    DataFrame.groupby : generic groupby of a DataFrame
-    dask.dataframe.apply_concat_apply : for more description of the
-        split_every argument.
-
-    """
-    # Assert that aggregations are supported
-    aggs = _redirect_aggs(aggs_in)
-    if not _aggs_optimized(aggs, OPTIMIZED_AGGS):
-        raise ValueError(
-            f"Supported aggs include {OPTIMIZED_AGGS} for groupby_agg API. "
-            f"Aggregations must be specified with dict or list syntax."
-        )
-
-    # If split_every is False, we use an all-to-one reduction
-    if split_every is False:
-        split_every = max(ddf.npartitions, 2)
-
-    # Deal with default split_out and split_every params
-    split_every = split_every or 8
-    split_out = split_out or 1
-
-    # Standardize `gb_cols`, `columns`, and `aggs`
-    if isinstance(gb_cols, str):
-        gb_cols = [gb_cols]
-    columns = [c for c in ddf.columns if c not in gb_cols]
-    if not isinstance(aggs, dict):
-        aggs = {col: aggs for col in columns}
-
-    # Assert if our output will have a MultiIndex; this will be the case if
-    # any value in the `aggs` dict is not a string (i.e. multiple/named
-    # aggregations per column)
-    str_cols_out = True
-    aggs_renames = {}
-    for col in aggs:
-        if isinstance(aggs[col], str) or callable(aggs[col]):
-            aggs[col] = [aggs[col]]
-        elif isinstance(aggs[col], dict):
-            str_cols_out = False
-            col_aggs = []
-            for k, v in aggs[col].items():
-                aggs_renames[col, v] = k
-                col_aggs.append(v)
-            aggs[col] = col_aggs
-        else:
-            str_cols_out = False
-        if col in gb_cols:
-            columns.append(col)
-
-    # Construct meta
-    _aggs = aggs.copy()
-    if str_cols_out:
-        # Metadata should use `str` for dict values if that is
-        # what the user originally specified (column names will
-        # be str, rather than tuples).
-        for col in aggs:
-            _aggs[col] = _aggs[col][0]
-    _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs)
-    if aggs_renames:
-        col_array = []
-        agg_array = []
-        for col, agg in _meta.columns:
-            col_array.append(col)
-            agg_array.append(aggs_renames.get((col, agg), agg))
-        _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array])
-
-    chunk = _groupby_partition_agg
-    chunk_kwargs = {
-        "gb_cols": gb_cols,
-        "aggs": aggs,
-        "columns": columns,
-        "dropna": dropna,
-        "sort": sort,
-        "sep": sep,
-    }
-
-    combine = _tree_node_agg
-    combine_kwargs = {
-        "gb_cols": gb_cols,
-        "dropna": dropna,
-        "sort": sort,
-        "sep": sep,
-    }
-
-    aggregate = _finalize_gb_agg
-    aggregate_kwargs = {
-        "gb_cols": gb_cols,
-        "aggs": aggs,
-        "columns": columns,
-        "final_columns": _meta.columns,
-        "as_index": as_index,
-        "dropna": dropna,
-        "sort": sort,
-        "sep": sep,
-        "str_cols_out": str_cols_out,
-        "aggs_renames": aggs_renames,
-    }
-
-    # Use shuffle_method=True for split_out>1
-    if sort and split_out > 1 and shuffle_method is None:
-        shuffle_method = "tasks"
-
-    # Check if we are using the shuffle-based algorithm
-    if shuffle_method:
-        # Shuffle-based aggregation
-        return _shuffle_aggregate(
-            ddf,
-            gb_cols,
-            chunk,
-            chunk_kwargs,
-            aggregate,
-            aggregate_kwargs,
-            split_every,
-            split_out,
-            token="cudf-aggregate",
-            sort=sort,
-            shuffle_method=shuffle_method
-            if isinstance(shuffle_method, str)
-            else None,
-        )
-
-    # Deal with sort/shuffle defaults
-    if split_out > 1 and sort:
-        raise ValueError(
-            "dask-cudf's groupby algorithm does not yet support "
-            "`sort=True` when `split_out>1`, unless a shuffle-based "
-            "algorithm is used. Please use `split_out=1`, group "
-            "with `sort=False`, or set `shuffle_method=True`."
-        )
-
-    # Determine required columns to enable column projection
-    required_columns = list(
-        set(gb_cols).union(aggs.keys()).intersection(ddf.columns)
-    )
-
-    return aca(
-        [ddf[required_columns]],
-        chunk=chunk,
-        chunk_kwargs=chunk_kwargs,
-        combine=combine,
-        combine_kwargs=combine_kwargs,
-        aggregate=aggregate,
-        aggregate_kwargs=aggregate_kwargs,
-        token="cudf-aggregate",
-        split_every=split_every,
-        split_out=split_out,
-        split_out_setup=split_out_on_cols,
-        split_out_setup_kwargs={"cols": gb_cols},
-        sort=sort,
-        ignore_index=True,
-    )
-
-
-@_dask_cudf_performance_tracking
-def _make_groupby_agg_call(
-    gb, aggs, split_every, split_out, shuffle_method=None
-):
-    """Helper method to consolidate the common `groupby_agg` call for all
-    aggregations in one place
-    """
-
-    return groupby_agg(
-        gb.obj,
-        gb.by,
-        aggs,
-        split_every=split_every,
-        split_out=split_out,
-        sep=gb.sep,
-        sort=gb.sort,
-        as_index=gb.as_index,
-        shuffle_method=shuffle_method,
-        **gb.dropna,
-    )
-
-
-@_dask_cudf_performance_tracking
-def _redirect_aggs(arg):
-    """Redirect aggregations to their corresponding name in cuDF"""
-    redirects = {
-        sum: "sum",
-        max: "max",
-        min: "min",
-        "collect": list,
-        "list": list,
-    }
-    if isinstance(arg, dict):
-        new_arg = dict()
-        for col in arg:
-            if isinstance(arg[col], list):
-                new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]]
-            elif isinstance(arg[col], dict):
-                new_arg[col] = {
-                    k: redirects.get(v, v) for k, v in arg[col].items()
-                }
-            else:
-                new_arg[col] = redirects.get(arg[col], arg[col])
-        return new_arg
-    if isinstance(arg, list):
-        return [redirects.get(agg, agg) for agg in arg]
-    return redirects.get(arg, arg)
-
-
-@_dask_cudf_performance_tracking
-def _aggs_optimized(arg, supported: set):
-    """Check that aggregations in `arg` are a subset of `supported`"""
-    if isinstance(arg, (list, dict)):
-        if isinstance(arg, dict):
-            _global_set: set[str] = set()
-            for col in arg:
-                if isinstance(arg[col], list):
-                    _global_set = _global_set.union(set(arg[col]))
-                elif isinstance(arg[col], dict):
-                    _global_set = _global_set.union(set(arg[col].values()))
-                else:
-                    _global_set.add(arg[col])
-        else:
-            _global_set = set(arg)
-
-        return bool(_global_set.issubset(supported))
-    elif isinstance(arg, (str, type)):
-        return arg in supported
-    return False
-
-
-@_dask_cudf_performance_tracking
-def _groupby_optimized(gb):
-    """Check that groupby input can use dask-cudf optimized codepath"""
-    return isinstance(gb.obj, DaskDataFrame) and (
-        isinstance(gb.by, str)
-        or (isinstance(gb.by, list) and all(isinstance(x, str) for x in gb.by))
-    )
-
-
-def _make_name(col_name, sep="_"):
-    """Combine elements of `col_name` into a single string, or no-op if
-    `col_name` is already a string
-    """
-    if isinstance(col_name, str):
-        return col_name
-    return sep.join(name for name in col_name if name != "")
-
-
-@_dask_cudf_performance_tracking
-def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep):
-    """Initial partition-level aggregation task.
-
-    This is the first operation to be executed on each input
-    partition in `groupby_agg`.  Depending on `aggs`, four possible
-    groupby aggregations ("count", "sum", "min", and "max") are
-    performed.  The result is then partitioned (by hashing `gb_cols`)
-    into a number of distinct dictionary elements.  The number of
-    elements in the output dictionary (`split_out`) corresponds to
-    the number of partitions in the final output of `groupby_agg`.
-    """
-
-    # Modify dict for initial (partition-wise) aggregations
-    _agg_dict = {}
-    for col, agg_list in aggs.items():
-        _agg_dict[col] = set()
-        for agg in agg_list:
-            if agg in ("mean", "std", "var"):
-                _agg_dict[col].add("count")
-                _agg_dict[col].add("sum")
-            else:
-                _agg_dict[col].add(agg)
-        _agg_dict[col] = list(_agg_dict[col])
-        if set(agg_list).intersection({"std", "var"}):
-            pow2_name = _make_name((col, "pow2"), sep=sep)
-            df[pow2_name] = df[col].astype("float64").pow(2)
-            _agg_dict[pow2_name] = ["sum"]
-
-    gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg(
-        _agg_dict
-    )
-    output_columns = [_make_name(name, sep=sep) for name in gb.columns]
-    gb.columns = output_columns
-    # Return with deterministic column ordering
-    return gb[sorted(output_columns)]
-
-
-@_dask_cudf_performance_tracking
-def _tree_node_agg(df, gb_cols, dropna, sort, sep):
-    """Node in groupby-aggregation reduction tree.
-
-    The input DataFrame (`df`) corresponds to the
-    concatenated output of one or more `_groupby_partition_agg`
-    tasks. In this function, "sum", "min" and/or "max" groupby
-    aggregations will be used to combine the statistics for
-    duplicate keys.
-    """
-
-    agg_dict = {}
-    for col in df.columns:
-        if col in gb_cols:
-            continue
-        agg = col.split(sep)[-1]
-        if agg in ("count", "sum"):
-            agg_dict[col] = ["sum"]
-        elif agg == "list":
-            agg_dict[col] = [list]
-        elif agg in OPTIMIZED_AGGS:
-            agg_dict[col] = [agg]
-        else:
-            raise ValueError(f"Unexpected aggregation: {agg}")
-
-    gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg(
-        agg_dict
-    )
-
-    # Don't include the last aggregation in the column names
-    output_columns = [
-        _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep)
-        for name in gb.columns
-    ]
-    gb.columns = output_columns
-    # Return with deterministic column ordering
-    return gb[sorted(output_columns)]
-
-
-@_dask_cudf_performance_tracking
-def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
-    """Calculate variance (given count, sum, and sum-squared columns)."""
-
-    # Select count, sum, and sum-squared
-    n = df[count_name]
-    x = df[sum_name]
-    x2 = df[pow2_sum_name]
-
-    # Use sum-squared approach to get variance
-    var = x2 - x**2 / n
-    div = n - ddof
-    div[div < 1] = 1  # Avoid division by 0
-    var /= div
-
-    # Set appropriate NaN elements
-    # (since we avoided 0-division)
-    var[(n - ddof) == 0] = np.nan
-
-    return var
-
-
-@_dask_cudf_performance_tracking
-def _finalize_gb_agg(
-    gb_in,
-    gb_cols,
-    aggs,
-    columns,
-    final_columns,
-    as_index,
-    dropna,
-    sort,
-    sep,
-    str_cols_out,
-    aggs_renames,
-):
-    """Final aggregation task.
-
-    This is the final operation on each output partitions
-    of the `groupby_agg` algorithm.  This function must
-    take care of higher-order aggregations, like "mean",
-    "std" and "var".  We also need to deal with the column
-    index, the row index, and final sorting behavior.
-    """
-
-    gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep)
-
-    # Deal with higher-order aggregations
-    for col in columns:
-        agg_list = aggs.get(col, [])
-        agg_set = set(agg_list)
-        if agg_set.intersection({"mean", "std", "var"}):
-            count_name = _make_name((col, "count"), sep=sep)
-            sum_name = _make_name((col, "sum"), sep=sep)
-            if agg_set.intersection({"std", "var"}):
-                pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep)
-                var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name)
-                if "var" in agg_list:
-                    name_var = _make_name((col, "var"), sep=sep)
-                    gb[name_var] = var
-                if "std" in agg_list:
-                    name_std = _make_name((col, "std"), sep=sep)
-                    gb[name_std] = np.sqrt(var)
-                gb.drop(columns=[pow2_sum_name], inplace=True)
-            if "mean" in agg_list:
-                mean_name = _make_name((col, "mean"), sep=sep)
-                gb[mean_name] = gb[sum_name] / gb[count_name]
-            if "sum" not in agg_list:
-                gb.drop(columns=[sum_name], inplace=True)
-            if "count" not in agg_list:
-                gb.drop(columns=[count_name], inplace=True)
-        if list in agg_list:
-            collect_name = _make_name((col, "list"), sep=sep)
-            gb[collect_name] = gb[collect_name].list.concat()
-
-    # Ensure sorted keys if `sort=True`
-    if sort:
-        gb = gb.sort_values(gb_cols)
-
-    # Set index if necessary
-    if as_index:
-        gb.set_index(gb_cols, inplace=True)
-
-    # Unflatten column names
-    col_array = []
-    agg_array = []
-    for col in gb.columns:
-        if col in gb_cols:
-            col_array.append(col)
-            agg_array.append("")
-        else:
-            name, agg = col.split(sep)
-            col_array.append(name)
-            agg_array.append(aggs_renames.get((name, agg), agg))
-    if str_cols_out:
-        gb.columns = col_array
-    else:
-        gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array])
-
-    return gb[final_columns]
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py
index 0421bd755f4..c544c32523f 100644
--- a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py
+++ b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py
@@ -1,11 +1 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
-
-from .csv import read_csv  # noqa: F401
-from .json import read_json  # noqa: F401
-from .orc import read_orc, to_orc  # noqa: F401
-from .text import read_text  # noqa: F401
-
-try:
-    from .parquet import read_parquet, to_parquet  # noqa: F401
-except ImportError:
-    pass
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/csv.py b/python/dask_cudf/dask_cudf/_legacy/io/csv.py
deleted file mode 100644
index fa5400344f9..00000000000
--- a/python/dask_cudf/dask_cudf/_legacy/io/csv.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
-
-import os
-from glob import glob
-from warnings import warn
-
-from fsspec.utils import infer_compression
-
-from dask import dataframe as dd
-from dask.base import tokenize
-from dask.dataframe.io.csv import make_reader
-from dask.utils import apply, parse_bytes
-
-import cudf
-
-
-def read_csv(path, blocksize="default", **kwargs):
-    """
-    Read CSV files into a :class:`.DataFrame`.
-
-    This API parallelizes the :func:`cudf:cudf.read_csv` function in
-    the following ways:
-
-    It supports loading many files at once using globstrings:
-
-    >>> import dask_cudf
-    >>> df = dask_cudf.read_csv("myfiles.*.csv")
-
-    In some cases it can break up large files:
-
-    >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB")
-
-    It can read CSV files from external resources (e.g. S3, HTTP, FTP)
-
-    >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv")
-    >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv")
-
-    Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and
-    supports many of the same keyword arguments with the same
-    performance guarantees. See the docstring for
-    :func:`cudf:cudf.read_csv` for more information on available
-    keyword arguments.
-
-    Parameters
-    ----------
-    path : str, path object, or file-like object
-        Either a path to a file (a str, :py:class:`pathlib.Path`, or
-        py._path.local.LocalPath), URL (including http, ftp, and S3
-        locations), or any object with a read() method (such as
-        builtin :py:func:`open` file handler function or
-        :py:class:`~io.StringIO`).
-    blocksize : int or str, default "256 MiB"
-        The target task partition size. If ``None``, a single block
-        is used for each file.
-    **kwargs : dict
-        Passthrough key-word arguments that are sent to
-        :func:`cudf:cudf.read_csv`.
-
-    Notes
-    -----
-    If any of `skipfooter`/`skiprows`/`nrows` are passed,
-    `blocksize` will default to None.
-
-    Examples
-    --------
-    >>> import dask_cudf
-    >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"])
-    >>> ddf.compute()
-       a      b
-    0  1     hi
-    1  2  hello
-    2  3     ai
-
-    """
-
-    # Handle `chunksize` deprecation
-    if "chunksize" in kwargs:
-        chunksize = kwargs.pop("chunksize", "default")
-        warn(
-            "`chunksize` is deprecated and will be removed in the future. "
-            "Please use `blocksize` instead.",
-            FutureWarning,
-        )
-        if blocksize == "default":
-            blocksize = chunksize
-
-    # Set default `blocksize`
-    if blocksize == "default":
-        if (
-            kwargs.get("skipfooter", 0) != 0
-            or kwargs.get("skiprows", 0) != 0
-            or kwargs.get("nrows", None) is not None
-        ):
-            # Cannot read in blocks if skipfooter,
-            # skiprows or nrows is passed.
-            blocksize = None
-        else:
-            blocksize = "256 MiB"
-
-    if "://" in str(path):
-        func = make_reader(cudf.read_csv, "read_csv", "CSV")
-        return func(path, blocksize=blocksize, **kwargs)
-    else:
-        return _internal_read_csv(path=path, blocksize=blocksize, **kwargs)
-
-
-def _internal_read_csv(path, blocksize="256 MiB", **kwargs):
-    if isinstance(blocksize, str):
-        blocksize = parse_bytes(blocksize)
-
-    if isinstance(path, list):
-        filenames = path
-    elif isinstance(path, str):
-        filenames = sorted(glob(path))
-    elif hasattr(path, "__fspath__"):
-        filenames = sorted(glob(path.__fspath__()))
-    else:
-        raise TypeError(f"Path type not understood:{type(path)}")
-
-    if not filenames:
-        msg = f"A file in: {filenames} does not exist."
-        raise FileNotFoundError(msg)
-
-    name = "read-csv-" + tokenize(
-        path, tokenize, **kwargs
-    )  # TODO: get last modified time
-
-    compression = kwargs.get("compression", "infer")
-
-    if compression == "infer":
-        # Infer compression from first path by default
-        compression = infer_compression(filenames[0])
-
-    if compression and blocksize:
-        # compressed CSVs reading must read the entire file
-        kwargs.pop("byte_range", None)
-        warn(
-            "Warning %s compression does not support breaking apart files\n"
-            "Please ensure that each individual file can fit in memory and\n"
-            "use the keyword ``blocksize=None to remove this message``\n"
-            "Setting ``blocksize=(size of file)``" % compression
-        )
-        blocksize = None
-
-    if blocksize is None:
-        return read_csv_without_blocksize(path, **kwargs)
-
-    # Let dask.dataframe generate meta
-    dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
-    kwargs1 = kwargs.copy()
-    usecols = kwargs1.pop("usecols", None)
-    dtype = kwargs1.pop("dtype", None)
-    meta = dask_reader(filenames[0], **kwargs1)._meta
-    names = meta.columns
-    if usecols or dtype:
-        # Regenerate meta with original kwargs if
-        # `usecols` or `dtype` was specified
-        meta = dask_reader(filenames[0], **kwargs)._meta
-
-    dsk = {}
-    i = 0
-    dtypes = meta.dtypes.values
-
-    for fn in filenames:
-        size = os.path.getsize(fn)
-        for start in range(0, size, blocksize):
-            kwargs2 = kwargs.copy()
-            kwargs2["byte_range"] = (
-                start,
-                blocksize,
-            )  # specify which chunk of the file we care about
-            if start != 0:
-                kwargs2["names"] = names  # no header in the middle of the file
-                kwargs2["header"] = None
-            dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2)
-
-            i += 1
-
-    divisions = [None] * (len(dsk) + 1)
-    return dd.core.new_dd_object(dsk, name, meta, divisions)
-
-
-def _read_csv(fn, dtypes=None, **kwargs):
-    return cudf.read_csv(fn, **kwargs)
-
-
-def read_csv_without_blocksize(path, **kwargs):
-    """Read entire CSV with optional compression (gzip/zip)
-
-    Parameters
-    ----------
-    path : str
-        path to files (support for glob)
-    """
-    if isinstance(path, list):
-        filenames = path
-    elif isinstance(path, str):
-        filenames = sorted(glob(path))
-    elif hasattr(path, "__fspath__"):
-        filenames = sorted(glob(path.__fspath__()))
-    else:
-        raise TypeError(f"Path type not understood:{type(path)}")
-
-    name = "read-csv-" + tokenize(path, **kwargs)
-
-    meta_kwargs = kwargs.copy()
-    if "skipfooter" in meta_kwargs:
-        meta_kwargs.pop("skipfooter")
-    if "nrows" in meta_kwargs:
-        meta_kwargs.pop("nrows")
-    # Read "head" of first file (first 5 rows).
-    # Convert to empty df for metadata.
-    meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0]
-
-    graph = {
-        (name, i): (apply, cudf.read_csv, [fn], kwargs)
-        for i, fn in enumerate(filenames)
-    }
-
-    divisions = [None] * (len(filenames) + 1)
-
-    return dd.core.new_dd_object(graph, name, meta, divisions)
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/json.py b/python/dask_cudf/dask_cudf/_legacy/io/json.py
deleted file mode 100644
index 98c5ceedb76..00000000000
--- a/python/dask_cudf/dask_cudf/_legacy/io/json.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
-
-from functools import partial
-
-import numpy as np
-from fsspec.core import get_compression, get_fs_token_paths
-
-import dask
-from dask.utils import parse_bytes
-
-import cudf
-from cudf.core.column import as_column
-from cudf.utils.ioutils import _is_local_filesystem
-
-from dask_cudf.backends import _default_backend
-
-
-def _read_json_partition(
-    paths,
-    fs=None,
-    include_path_column=False,
-    path_converter=None,
-    **kwargs,
-):
-    # Transfer all data up front for remote storage
-    sources = (
-        paths
-        if fs is None
-        else fs.cat_ranges(
-            paths,
-            [0] * len(paths),
-            fs.sizes(paths),
-        )
-    )
-
-    if include_path_column:
-        # Add "path" column.
-        # Must iterate over sources sequentially
-        if not isinstance(include_path_column, str):
-            include_path_column = "path"
-        converted_paths = (
-            paths
-            if path_converter is None
-            else [path_converter(path) for path in paths]
-        )
-        dfs = []
-        for i, source in enumerate(sources):
-            df = cudf.read_json(source, **kwargs)
-            df[include_path_column] = as_column(
-                converted_paths[i], length=len(df)
-            )
-            dfs.append(df)
-        return cudf.concat(dfs)
-    else:
-        # Pass sources directly to cudf
-        return cudf.read_json(sources, **kwargs)
-
-
-def read_json(
-    url_path,
-    engine="auto",
-    blocksize=None,
-    orient="records",
-    lines=None,
-    compression="infer",
-    aggregate_files=True,
-    **kwargs,
-):
-    """Read JSON data into a :class:`.DataFrame`.
-
-    This function wraps :func:`dask.dataframe.read_json`, and passes
-    ``engine=partial(cudf.read_json, engine="auto")`` by default.
-
-    Parameters
-    ----------
-    url_path : str, list of str
-        Location to read from. If a string, can include a glob character to
-        find a set of file names.
-        Supports protocol specifications such as ``"s3://"``.
-    engine : str or Callable, default "auto"
-
-        If str, this value will be used as the ``engine`` argument
-        when :func:`cudf.read_json` is used to create each partition.
-        If a :obj:`~collections.abc.Callable`, this value will be used as the
-        underlying function used to create each partition from JSON
-        data. The default value is "auto", so that
-        ``engine=partial(cudf.read_json, engine="auto")`` will be
-        passed to :func:`dask.dataframe.read_json` by default.
-    aggregate_files : bool or int
-        Whether to map multiple files to each output partition. If True,
-        the `blocksize` argument will be used to determine the number of
-        files in each partition. If any one file is larger than `blocksize`,
-        the `aggregate_files` argument will be ignored. If an integer value
-        is specified, the `blocksize` argument will be ignored, and that
-        number of files will be mapped to each partition. Default is True.
-    **kwargs :
-        Key-word arguments to pass through to :func:`dask.dataframe.read_json`.
-
-    Returns
-    -------
-    :class:`.DataFrame`
-
-    Examples
-    --------
-    Load single file
-
-    >>> from dask_cudf import read_json
-    >>> read_json('myfile.json')  # doctest: +SKIP
-
-    Load large line-delimited JSON files using partitions of approx
-    256MB size
-
-    >>> read_json('data/file*.csv', blocksize=2**28)  # doctest: +SKIP
-
-    Load nested JSON data
-
-    >>> read_json('myfile.json')  # doctest: +SKIP
-
-    See Also
-    --------
-    dask.dataframe.read_json
-
-    """
-
-    if lines is None:
-        lines = orient == "records"
-    if orient != "records" and lines:
-        raise ValueError(
-            'Line-delimited JSON is only available with orient="records".'
-        )
-    if blocksize and (orient != "records" or not lines):
-        raise ValueError(
-            "JSON file chunking only allowed for JSON-lines"
-            "input (orient='records', lines=True)."
-        )
-
-    inputs = []
-    if aggregate_files and blocksize or int(aggregate_files) > 1:
-        # Attempt custom read if we are mapping multiple files
-        # to each output partition. Otherwise, upstream logic
-        # is sufficient.
-
-        storage_options = kwargs.get("storage_options", {})
-        fs, _, paths = get_fs_token_paths(
-            url_path, mode="rb", storage_options=storage_options
-        )
-        if isinstance(aggregate_files, int) and aggregate_files > 1:
-            # Map a static file count to each partition
-            inputs = [
-                paths[offset : offset + aggregate_files]
-                for offset in range(0, len(paths), aggregate_files)
-            ]
-        elif aggregate_files is True and blocksize:
-            # Map files dynamically (using blocksize)
-            file_sizes = fs.sizes(paths)  # NOTE: This can be slow
-            blocksize = parse_bytes(blocksize)
-            if all([file_size <= blocksize for file_size in file_sizes]):
-                counts = np.unique(
-                    np.floor(np.cumsum(file_sizes) / blocksize),
-                    return_counts=True,
-                )[1]
-                offsets = np.concatenate([[0], counts.cumsum()])
-                inputs = [
-                    paths[offsets[i] : offsets[i + 1]]
-                    for i in range(len(offsets) - 1)
-                ]
-
-    if inputs:
-        # Inputs were successfully populated.
-        # Use custom _read_json_partition function
-        # to generate each partition.
-
-        compression = get_compression(
-            url_path[0] if isinstance(url_path, list) else url_path,
-            compression,
-        )
-        _kwargs = dict(
-            orient=orient,
-            lines=lines,
-            compression=compression,
-            include_path_column=kwargs.get("include_path_column", False),
-            path_converter=kwargs.get("path_converter"),
-        )
-        if not _is_local_filesystem(fs):
-            _kwargs["fs"] = fs
-        # TODO: Generate meta more efficiently
-        meta = _read_json_partition(inputs[0][:1], **_kwargs)
-        return dask.dataframe.from_map(
-            _read_json_partition,
-            inputs,
-            meta=meta,
-            **_kwargs,
-        )
-
-    # Fall back to dask.dataframe.read_json
-    return _default_backend(
-        dask.dataframe.read_json,
-        url_path,
-        engine=(
-            partial(cudf.read_json, engine=engine)
-            if isinstance(engine, str)
-            else engine
-        ),
-        blocksize=blocksize,
-        orient=orient,
-        lines=lines,
-        compression=compression,
-        **kwargs,
-    )
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/orc.py b/python/dask_cudf/dask_cudf/_legacy/io/orc.py
deleted file mode 100644
index fcf684fd6c8..00000000000
--- a/python/dask_cudf/dask_cudf/_legacy/io/orc.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from io import BufferedWriter, IOBase
-
-from fsspec.core import get_fs_token_paths
-from fsspec.utils import stringify_path
-from pyarrow import orc as orc
-
-from dask import dataframe as dd
-from dask.dataframe.io.utils import _get_pyarrow_dtypes
-
-import cudf
-
-
-def _read_orc_stripe(source, fs, columns=None, kwargs=None):
-    """Pull out specific columns from specific stripe"""
-    path, stripe = source
-    if kwargs is None:
-        kwargs = {}
-    with fs.open(path, "rb") as f:
-        df_stripe = cudf.read_orc(
-            f, stripes=[stripe], columns=columns, **kwargs
-        )
-    return df_stripe
-
-
-def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
-    """Read ORC files into a :class:`.DataFrame`.
-
-    Note that this function is mostly borrowed from upstream Dask.
-
-    Parameters
-    ----------
-    path : str or list[str]
-        Location of file(s), which can be a full URL with protocol specifier,
-        and may include glob character if a single string.
-    columns : None or list[str]
-        Columns to load. If None, loads all.
-    filters : None or list of tuple or list of lists of tuples
-        If not None, specifies a filter predicate used to filter out
-        row groups using statistics stored for each row group as
-        Parquet metadata. Row groups that do not match the given
-        filter predicate are not read. The predicate is expressed in
-        `disjunctive normal form (DNF)
-        <https://en.wikipedia.org/wiki/Disjunctive_normal_form>`__
-        like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary
-        boolean logical combinations of single column predicates. The
-        innermost tuples each describe a single column predicate. The
-        list of inner predicates is interpreted as a conjunction
-        (AND), forming a more selective and multiple column predicate.
-        Finally, the outermost list combines these filters as a
-        disjunction (OR). Predicates may also be passed as a list of
-        tuples. This form is interpreted as a single conjunction. To
-        express OR in predicates, one must use the (preferred)
-        notation of list of lists of tuples.
-    storage_options : None or dict
-        Further parameters to pass to the bytes backend.
-
-    See Also
-    --------
-    dask.dataframe.read_orc
-
-    Returns
-    -------
-    dask_cudf.DataFrame
-
-    """
-
-    storage_options = storage_options or {}
-    fs, _, paths = get_fs_token_paths(
-        path, mode="rb", storage_options=storage_options
-    )
-    schema = None
-    nstripes_per_file = []
-    for path in paths:
-        with fs.open(path, "rb") as f:
-            o = orc.ORCFile(f)
-            if schema is None:
-                schema = o.schema
-            elif schema != o.schema:
-                raise ValueError(
-                    "Incompatible schemas while parsing ORC files"
-                )
-            nstripes_per_file.append(o.nstripes)
-    schema = _get_pyarrow_dtypes(schema, categories=None)
-    if columns is not None:
-        ex = set(columns) - set(schema)
-        if ex:
-            raise ValueError(
-                f"Requested columns ({ex}) not in schema ({set(schema)})"
-            )
-    else:
-        columns = list(schema)
-
-    with fs.open(paths[0], "rb") as f:
-        meta = cudf.read_orc(
-            f,
-            stripes=[0] if nstripes_per_file[0] else None,
-            columns=columns,
-            **kwargs,
-        )
-
-    sources = []
-    for path, n in zip(paths, nstripes_per_file):
-        for stripe in (
-            range(n)
-            if filters is None
-            else cudf.io.orc._filter_stripes(filters, path)
-        ):
-            sources.append((path, stripe))
-
-    return dd.from_map(
-        _read_orc_stripe,
-        sources,
-        args=[fs],
-        columns=columns,
-        kwargs=kwargs,
-        meta=meta,
-    )
-
-
-def write_orc_partition(df, path, fs, filename, compression="snappy"):
-    full_path = fs.sep.join([path, filename])
-    with fs.open(full_path, mode="wb") as out_file:
-        if not isinstance(out_file, IOBase):
-            out_file = BufferedWriter(out_file)
-        cudf.io.to_orc(df, out_file, compression=compression)
-    return full_path
-
-
-def to_orc(
-    df,
-    path,
-    write_index=True,
-    storage_options=None,
-    compression="snappy",
-    compute=True,
-    **kwargs,
-):
-    """
-    Write a :class:`.DataFrame` to ORC file(s) (one file per partition).
-
-    Parameters
-    ----------
-    df : DataFrame
-    path : str or pathlib.Path
-        Destination directory for data.  Prepend with protocol like ``s3://``
-        or ``hdfs://`` for remote data.
-    write_index : boolean, optional
-        Whether or not to write the index. Defaults to True.
-    storage_options : None or dict
-        Further parameters to pass to the bytes backend.
-    compression : string or dict, optional
-    compute : bool, optional
-        If True (default) then the result is computed immediately. If
-        False then a :class:`~dask.delayed.Delayed` object is returned
-        for future computation.
-
-    """
-
-    from dask import compute as dask_compute, delayed
-
-    # TODO: Use upstream dask implementation once available
-    #       (see: Dask Issue#5596)
-
-    if hasattr(path, "name"):
-        path = stringify_path(path)
-    fs, _, _ = get_fs_token_paths(
-        path, mode="wb", storage_options=storage_options
-    )
-    # Trim any protocol information from the path before forwarding
-    path = fs._strip_protocol(path)
-
-    if write_index:
-        df = df.reset_index()
-    else:
-        # Not writing index - might as well drop it
-        df = df.reset_index(drop=True)
-
-    fs.mkdirs(path, exist_ok=True)
-
-    # Use i_offset and df.npartitions to define file-name list
-    filenames = ["part.%i.orc" % i for i in range(df.npartitions)]
-
-    # write parts
-    dwrite = delayed(write_orc_partition)
-    parts = [
-        dwrite(d, path, fs, filename, compression=compression)
-        for d, filename in zip(df.to_delayed(), filenames)
-    ]
-
-    if compute:
-        return dask_compute(*parts)
-
-    return delayed(list)(parts)
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
index c0638e4a1c3..c0792663c7e 100644
--- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 import itertools
 import warnings
 from functools import partial
@@ -8,7 +8,7 @@
 import pandas as pd
 from pyarrow import dataset as pa_ds, parquet as pq
 
-from dask import dataframe as dd
+import dask.dataframe as dd
 from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine
 
 try:
@@ -448,65 +448,7 @@ def set_object_dtypes_from_pa_schema(df, schema):
                 df._data[col_name] = col.astype(typ)
 
 
-def read_parquet(path, columns=None, **kwargs):
-    """
-    Read parquet files into a :class:`.DataFrame`.
-
-    Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine``
-    to coordinate the execution of :func:`cudf.read_parquet`, and to
-    ultimately create a :class:`.DataFrame` collection.
-
-    See the :func:`dask.dataframe.read_parquet` documentation for
-    all available options.
-
-    Examples
-    --------
-    >>> from dask_cudf import read_parquet
-    >>> df = read_parquet("/path/to/dataset/")  # doctest: +SKIP
-
-    When dealing with one or more large parquet files having an
-    in-memory footprint >15% device memory, the ``split_row_groups``
-    argument should be used to map Parquet **row-groups** to DataFrame
-    partitions (instead of **files** to partitions). For example, the
-    following code will map each row-group to a distinct partition:
-
-    >>> df = read_parquet(..., split_row_groups=True)  # doctest: +SKIP
-
-    To map **multiple** row-groups to each partition, an integer can be
-    passed to ``split_row_groups`` to specify the **maximum** number of
-    row-groups allowed in each output partition:
-
-    >>> df = read_parquet(..., split_row_groups=10)  # doctest: +SKIP
-
-    See Also
-    --------
-    cudf.read_parquet
-    dask.dataframe.read_parquet
-    """
-    if isinstance(columns, str):
-        columns = [columns]
-
-    # Set "check_file_size" option to determine whether we
-    # should check the parquet-file size. This check is meant
-    # to "protect" users from `split_row_groups` default changes
-    check_file_size = kwargs.pop("check_file_size", 500_000_000)
-    if (
-        check_file_size
-        and ("split_row_groups" not in kwargs)
-        and ("chunksize" not in kwargs)
-    ):
-        # User is not specifying `split_row_groups` or `chunksize`,
-        # so we should warn them if/when a file is ~>0.5GB on disk.
-        # They can set `split_row_groups` explicitly to silence/skip
-        # this check
-        if "read" not in kwargs:
-            kwargs["read"] = {}
-        kwargs["read"]["check_file_size"] = check_file_size
-
-    return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs)
-
-
-to_parquet = partial(dd.to_parquet, engine=CudfEngine)
+to_parquet = dd.to_parquet
 
 if create_metadata_file_dd is None:
     create_metadata_file = create_metadata_file_dd
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/text.py b/python/dask_cudf/dask_cudf/_legacy/io/text.py
deleted file mode 100644
index 3757c85c80c..00000000000
--- a/python/dask_cudf/dask_cudf/_legacy/io/text.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-
-import os
-from glob import glob
-
-import dask.dataframe as dd
-from dask.utils import parse_bytes
-
-import cudf
-
-
-def _read_text(source, **kwargs):
-    # Wrapper for cudf.read_text operation
-    fn, byte_range = source
-    return cudf.read_text(fn, byte_range=byte_range, **kwargs)
-
-
-def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs):
-    if isinstance(chunksize, str):
-        chunksize = parse_bytes(chunksize)
-
-    if isinstance(path, list):
-        filenames = path
-    elif isinstance(path, str):
-        filenames = sorted(glob(path))
-    elif hasattr(path, "__fspath__"):
-        filenames = sorted(glob(path.__fspath__()))
-    else:
-        raise TypeError(f"Path type not understood:{type(path)}")
-
-    if not filenames:
-        msg = f"A file in: {filenames} does not exist."
-        raise FileNotFoundError(msg)
-
-    if chunksize and byte_range:
-        raise ValueError("Cannot specify both chunksize and byte_range.")
-
-    if chunksize:
-        sources = []
-        for fn in filenames:
-            size = os.path.getsize(fn)
-            for start in range(0, size, chunksize):
-                byte_range = (
-                    start,
-                    chunksize,
-                )  # specify which chunk of the file we care about
-                sources.append((fn, byte_range))
-    else:
-        sources = [(fn, byte_range) for fn in filenames]
-
-    return dd.from_map(
-        _read_text,
-        sources,
-        meta=cudf.Series([], dtype="O"),
-        **kwargs,
-    )
diff --git a/python/dask_cudf/dask_cudf/_legacy/sorting.py b/python/dask_cudf/dask_cudf/_legacy/sorting.py
deleted file mode 100644
index a2ba4d1878e..00000000000
--- a/python/dask_cudf/dask_cudf/_legacy/sorting.py
+++ /dev/null
@@ -1,361 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-import warnings
-from collections.abc import Iterator
-from functools import wraps
-
-import cupy
-import numpy as np
-import tlz as toolz
-
-from dask import config
-from dask.base import tokenize
-from dask.dataframe import methods
-from dask.dataframe.core import DataFrame, Index, Series
-from dask.dataframe.shuffle import rearrange_by_column
-from dask.highlevelgraph import HighLevelGraph
-from dask.utils import M
-
-import cudf
-from cudf.api.types import _is_categorical_dtype
-from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
-
-_SHUFFLE_SUPPORT = ("tasks", "p2p")  # "disk" not supported
-
-
-def _deprecate_shuffle_kwarg(func):
-    @wraps(func)
-    def wrapper(*args, **kwargs):
-        old_arg_value = kwargs.pop("shuffle", None)
-
-        if old_arg_value is not None:
-            new_arg_value = old_arg_value
-            msg = (
-                "the 'shuffle' keyword is deprecated, "
-                "use 'shuffle_method' instead."
-            )
-
-            warnings.warn(msg, FutureWarning)
-            if kwargs.get("shuffle_method") is not None:
-                msg = (
-                    "Can only specify 'shuffle' "
-                    "or 'shuffle_method', not both."
-                )
-                raise TypeError(msg)
-            kwargs["shuffle_method"] = new_arg_value
-        return func(*args, **kwargs)
-
-    return wrapper
-
-
-@_dask_cudf_performance_tracking
-def set_index_post(df, index_name, drop, column_dtype):
-    df2 = df.set_index(index_name, drop=drop)
-    df2.columns = df2.columns.astype(column_dtype)
-    return df2
-
-
-@_dask_cudf_performance_tracking
-def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
-    if ascending:
-        partitions = divisions.searchsorted(s, side="right") - 1
-    else:
-        partitions = (
-            len(divisions) - divisions.searchsorted(s, side="right") - 1
-        )
-    partitions[(partitions < 0) | (partitions >= len(divisions) - 1)] = (
-        0 if ascending else (len(divisions) - 2)
-    )
-    partitions[s._columns[0].isnull().values] = (
-        len(divisions) - 2 if na_position == "last" else 0
-    )
-    return partitions
-
-
-@_dask_cudf_performance_tracking
-def _quantile(a, q):
-    n = len(a)
-    if not len(a):
-        return None, n
-    return (
-        a.quantile(q=q.tolist(), interpolation="nearest", method="table"),
-        n,
-    )
-
-
-@_dask_cudf_performance_tracking
-def merge_quantiles(finalq, qs, vals):
-    """Combine several quantile calculations of different data.
-    [NOTE: Same logic as dask.array merge_percentiles]
-    """
-    if isinstance(finalq, Iterator):
-        finalq = list(finalq)
-    finalq = np.array(finalq)
-    qs = list(map(list, qs))
-    vals = list(vals)
-    vals, Ns = zip(*vals)
-    Ns = list(Ns)
-
-    L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N]))
-    if not L:
-        raise ValueError("No non-trivial arrays found")
-    qs, vals, Ns = L
-
-    if len(vals) != len(qs) or len(Ns) != len(qs):
-        raise ValueError("qs, vals, and Ns parameters must be the same length")
-
-    # transform qs and Ns into number of observations between quantiles
-    counts = []
-    for q, N in zip(qs, Ns):
-        count = np.empty(len(q))
-        count[1:] = np.diff(q)
-        count[0] = q[0]
-        count *= N
-        counts.append(count)
-
-    def _append_counts(val, count):
-        val["_counts"] = count
-        return val
-
-    # Sort by calculated quantile values, then number of observations.
-    combined_vals_counts = cudf.core.reshape._merge_sorted(
-        [*map(_append_counts, vals, counts)]
-    )
-    combined_counts = cupy.asnumpy(combined_vals_counts["_counts"].values)
-    combined_vals = combined_vals_counts.drop(columns=["_counts"])
-
-    # quantile-like, but scaled by total number of observations
-    combined_q = np.cumsum(combined_counts)
-
-    # rescale finalq quantiles to match combined_q
-    desired_q = finalq * sum(Ns)
-
-    # TODO: Support other interpolation methods
-    # For now - Always use "nearest" for interpolation
-    left = np.searchsorted(combined_q, desired_q, side="left")
-    right = np.searchsorted(combined_q, desired_q, side="right") - 1
-    np.minimum(left, len(combined_vals) - 1, left)  # don't exceed max index
-    lower = np.minimum(left, right)
-    upper = np.maximum(left, right)
-    lower_residual = np.abs(combined_q[lower] - desired_q)
-    upper_residual = np.abs(combined_q[upper] - desired_q)
-    mask = lower_residual > upper_residual
-    index = lower  # alias; we no longer need lower
-    index[mask] = upper[mask]
-    rv = combined_vals.iloc[index]
-    return rv.reset_index(drop=True)
-
-
-@_dask_cudf_performance_tracking
-def _approximate_quantile(df, q):
-    """Approximate quantiles of DataFrame or Series.
-    [NOTE: Same logic as dask.dataframe Series quantile]
-    """
-    # current implementation needs q to be sorted so
-    # sort if array-like, otherwise leave it alone
-    q_ndarray = np.array(q)
-    if q_ndarray.ndim > 0:
-        q_ndarray.sort(kind="mergesort")
-        q = q_ndarray
-
-    # Lets assume we are dealing with a DataFrame throughout
-    if isinstance(df, (Series, Index)):
-        df = df.to_frame()
-    assert isinstance(df, DataFrame)
-    final_type = df._meta._constructor
-
-    # Create metadata
-    meta = df._meta_nonempty.quantile(q=q, method="table")
-
-    # Define final action (create df with quantiles as index)
-    def finalize_tsk(tsk):
-        return (final_type, tsk)
-
-    return_type = df.__class__
-
-    # pandas/cudf uses quantile in [0, 1]
-    # numpy / cupy uses [0, 100]
-    qs = np.asarray(q)
-    token = tokenize(df, qs)
-
-    if len(qs) == 0:
-        name = "quantiles-" + token
-        empty_index = cudf.Index([], dtype=float)
-        return Series(
-            {
-                (name, 0): final_type(
-                    {col: [] for col in df.columns},
-                    name=df.name,
-                    index=empty_index,
-                )
-            },
-            name,
-            df._meta,
-            [None, None],
-        )
-    else:
-        new_divisions = [np.min(q), np.max(q)]
-
-    name = "quantiles-1-" + token
-    val_dsk = {
-        (name, i): (_quantile, key, qs)
-        for i, key in enumerate(df.__dask_keys__())
-    }
-
-    name2 = "quantiles-2-" + token
-    merge_dsk = {
-        (name2, 0): finalize_tsk(
-            (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk))
-        )
-    }
-    dsk = toolz.merge(val_dsk, merge_dsk)
-    graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df])
-    df = return_type(graph, name2, meta, new_divisions)
-
-    def set_quantile_index(df):
-        df.index = q
-        return df
-
-    df = df.map_partitions(set_quantile_index, meta=meta)
-    return df
-
-
-@_dask_cudf_performance_tracking
-def quantile_divisions(df, by, npartitions):
-    qn = np.linspace(0.0, 1.0, npartitions + 1).tolist()
-    divisions = _approximate_quantile(df[by], qn).compute()
-    columns = divisions.columns
-
-    # TODO: Make sure divisions are correct for all dtypes..
-    if (
-        len(columns) == 1
-        and df[columns[0]].dtype != "object"
-        and not _is_categorical_dtype(df[columns[0]].dtype)
-    ):
-        dtype = df[columns[0]].dtype
-        divisions = divisions[columns[0]].astype("int64")
-        divisions.iloc[-1] += 1
-        divisions = sorted(
-            divisions.drop_duplicates().astype(dtype).to_arrow().tolist(),
-            key=lambda x: (x is None, x),
-        )
-    else:
-        for col in columns:
-            dtype = df[col].dtype
-            if dtype != "object":
-                divisions[col] = divisions[col].astype("int64")
-                divisions[col].iloc[-1] += 1
-                divisions[col] = divisions[col].astype(dtype)
-            else:
-                if last := divisions[col].iloc[-1]:
-                    val = chr(ord(last[0]) + 1)
-                else:
-                    val = "this string intentionally left empty"  # any but ""
-                divisions[col].iloc[-1] = val
-        divisions = divisions.drop_duplicates().sort_index()
-    return divisions
-
-
-@_deprecate_shuffle_kwarg
-@_dask_cudf_performance_tracking
-def sort_values(
-    df,
-    by,
-    max_branch=None,
-    divisions=None,
-    set_divisions=False,
-    ignore_index=False,
-    ascending=True,
-    na_position="last",
-    shuffle_method=None,
-    sort_function=None,
-    sort_function_kwargs=None,
-):
-    """Sort by the given list/tuple of column names."""
-
-    if not isinstance(ascending, bool):
-        raise ValueError("ascending must be either True or False")
-    if na_position not in ("first", "last"):
-        raise ValueError("na_position must be either 'first' or 'last'")
-
-    npartitions = df.npartitions
-    if isinstance(by, tuple):
-        by = list(by)
-    elif not isinstance(by, list):
-        by = [by]
-
-    # parse custom sort function / kwargs if provided
-    sort_kwargs = {
-        "by": by,
-        "ascending": ascending,
-        "na_position": na_position,
-    }
-    if sort_function is None:
-        sort_function = M.sort_values
-    if sort_function_kwargs is not None:
-        sort_kwargs.update(sort_function_kwargs)
-
-    # handle single partition case
-    if npartitions == 1:
-        return df.map_partitions(sort_function, **sort_kwargs)
-
-    # Step 1 - Calculate new divisions (if necessary)
-    if divisions is None:
-        divisions = quantile_divisions(df, by, npartitions)
-
-    # Step 2 - Perform repartitioning shuffle
-    meta = df._meta._constructor_sliced([0])
-    if not isinstance(divisions, (cudf.Series, cudf.DataFrame)):
-        dtype = df[by[0]].dtype
-        divisions = df._meta._constructor_sliced(divisions, dtype=dtype)
-
-    partitions = df[by].map_partitions(
-        _set_partitions_pre,
-        divisions=divisions,
-        ascending=ascending,
-        na_position=na_position,
-        meta=meta,
-    )
-
-    df2 = df.assign(_partitions=partitions)
-    df3 = rearrange_by_column(
-        df2,
-        "_partitions",
-        max_branch=max_branch,
-        npartitions=len(divisions) - 1,
-        shuffle_method=_get_shuffle_method(shuffle_method),
-        ignore_index=ignore_index,
-    ).drop(columns=["_partitions"])
-    df3.divisions = (None,) * (df3.npartitions + 1)
-
-    # Step 3 - Return final sorted df
-    df4 = df3.map_partitions(sort_function, **sort_kwargs)
-    if not isinstance(divisions, cudf.DataFrame) and set_divisions:
-        # Can't have multi-column divisions elsewhere in dask (yet)
-        df4.divisions = tuple(methods.tolist(divisions))
-
-    return df4
-
-
-def get_default_shuffle_method():
-    # Note that `dask.utils.get_default_shuffle_method`
-    # will return "p2p" by default when a distributed
-    # client is present. Dask-cudf supports "p2p", but
-    # will not use it by default (yet)
-    default = config.get("dataframe.shuffle.method", "tasks")
-    if default not in _SHUFFLE_SUPPORT:
-        default = "tasks"
-    return default
-
-
-def _get_shuffle_method(shuffle_method):
-    # Utility to set the shuffle_method-kwarg default
-    # and to validate user-specified options
-    shuffle_method = shuffle_method or get_default_shuffle_method()
-    if shuffle_method not in _SHUFFLE_SUPPORT:
-        raise ValueError(
-            "Dask-cudf only supports the following shuffle "
-            f"methods: {_SHUFFLE_SUPPORT}. Got shuffle_method={shuffle_method}"
-        )
-
-    return shuffle_method
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 962a229a839..f33733d9583 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 import warnings
 from collections.abc import Iterator
@@ -11,14 +11,12 @@
 from packaging.version import Version
 from pandas.api.types import is_scalar
 
-import dask.dataframe as dd
 from dask import config
 from dask.array.dispatch import percentile_lookup
 from dask.dataframe.backends import (
     DataFrameBackendEntrypoint,
     PandasBackendEntrypoint,
 )
-from dask.dataframe.core import get_parallel_type, meta_nonempty
 from dask.dataframe.dispatch import (
     categorical_dtype_dispatch,
     concat_dispatch,
@@ -28,6 +26,8 @@
     hash_object_dispatch,
     is_categorical_dtype_dispatch,
     make_meta_dispatch,
+    meta_nonempty,
+    partd_encode_dispatch,
     pyarrow_schema_dispatch,
     to_pyarrow_table_dispatch,
     tolist_dispatch,
@@ -46,13 +46,6 @@
 from cudf.api.types import is_string_dtype
 from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
-from ._legacy.core import DataFrame, Index, Series
-
-get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame)
-get_parallel_type.register(cudf.Series, lambda _: Series)
-get_parallel_type.register(cudf.BaseIndex, lambda _: Index)
-
-
 # Required for Arrow filesystem support in read_parquet
 PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0")
 
@@ -318,7 +311,7 @@ def tolist_cudf(obj):
 
 
 @is_categorical_dtype_dispatch.register(
-    (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series)
+    (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype)  # , Series)
 )
 @_dask_cudf_performance_tracking
 def is_categorical_dtype_cudf(obj):
@@ -464,28 +457,21 @@ def sizeof_cudf_series_index(obj):
     return obj.memory_usage()
 
 
-# TODO: Remove try/except when cudf is pinned to dask>=2023.10.0
-try:
-    from dask.dataframe.dispatch import partd_encode_dispatch
-
-    @partd_encode_dispatch.register(cudf.DataFrame)
-    def _simple_cudf_encode(_):
-        # Basic pickle-based encoding for a partd k-v store
-        import pickle
+@partd_encode_dispatch.register(cudf.DataFrame)
+def _simple_cudf_encode(_):
+    # Basic pickle-based encoding for a partd k-v store
+    import pickle
 
-        import partd
+    import partd
 
-        def join(dfs):
-            if not dfs:
-                return cudf.DataFrame()
-            else:
-                return cudf.concat(dfs)
-
-        dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL)
-        return partial(partd.Encode, dumps, pickle.loads, join)
+    def join(dfs):
+        if not dfs:
+            return cudf.DataFrame()
+        else:
+            return cudf.concat(dfs)
 
-except ImportError:
-    pass
+    dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL)
+    return partial(partd.Encode, dumps, pickle.loads, join)
 
 
 def _default_backend(func, *args, **kwargs):
@@ -557,105 +543,22 @@ def to_cudf_dispatch_from_cudf(data, **kwargs):
     return data
 
 
-# Define "cudf" backend engine to be registered with Dask
-class CudfBackendEntrypoint(DataFrameBackendEntrypoint):
-    """Backend-entrypoint class for Dask-DataFrame
+# Define the "cudf" backend for "legacy" Dask DataFrame
+class LegacyCudfBackendEntrypoint(DataFrameBackendEntrypoint):
+    """Backend-entrypoint class for legacy Dask-DataFrame
 
     This class is registered under the name "cudf" for the
-    ``dask.dataframe.backends`` entrypoint in ``setup.cfg``.
-    Dask-DataFrame will use the methods defined in this class
-    in place of ``dask.dataframe.<creation-method>`` when the
-    "dataframe.backend" configuration is set to "cudf":
-
-    Examples
-    --------
-    >>> import dask
-    >>> import dask.dataframe as dd
-    >>> with dask.config.set({"dataframe.backend": "cudf"}):
-    ...     ddf = dd.from_dict({"a": range(10)})
-    >>> type(ddf)
-    <class 'dask_cudf._legacy.core.DataFrame'>
+    ``dask.dataframe.backends`` entrypoint in ``pyproject.toml``.
+    This "legacy" backend is only used for CSV support.
     """
 
-    @classmethod
-    def to_backend_dispatch(cls):
-        return to_cudf_dispatch
-
-    @classmethod
-    def to_backend(cls, data: dd.core._Frame, **kwargs):
-        if isinstance(data._meta, (cudf.DataFrame, cudf.Series, cudf.Index)):
-            # Already a cudf-backed collection
-            _unsupported_kwargs("cudf", "cudf", kwargs)
-            return data
-        return data.map_partitions(cls.to_backend_dispatch(), **kwargs)
-
-    @staticmethod
-    def from_dict(
-        data,
-        npartitions,
-        orient="columns",
-        dtype=None,
-        columns=None,
-        constructor=cudf.DataFrame,
-    ):
-        return _default_backend(
-            dd.from_dict,
-            data,
-            npartitions=npartitions,
-            orient=orient,
-            dtype=dtype,
-            columns=columns,
-            constructor=constructor,
-        )
-
-    @staticmethod
-    def read_parquet(*args, engine=None, **kwargs):
-        from dask_cudf._legacy.io.parquet import CudfEngine
-
-        _raise_unsupported_parquet_kwargs(**kwargs)
-        return _default_backend(
-            dd.read_parquet,
-            *args,
-            engine=CudfEngine,
-            **kwargs,
-        )
-
-    @staticmethod
-    def read_json(*args, **kwargs):
-        from dask_cudf._legacy.io.json import read_json
-
-        return read_json(*args, **kwargs)
-
-    @staticmethod
-    def read_orc(*args, **kwargs):
-        from dask_cudf._legacy.io import read_orc
 
-        return read_orc(*args, **kwargs)
-
-    @staticmethod
-    def read_csv(*args, **kwargs):
-        from dask_cudf._legacy.io import read_csv
-
-        return read_csv(*args, **kwargs)
-
-    @staticmethod
-    def read_hdf(*args, **kwargs):
-        # HDF5 reader not yet implemented in cudf
-        warnings.warn(
-            "read_hdf is not yet implemented in cudf/dask_cudf. "
-            "Moving to cudf from pandas. Expect poor performance!"
-        )
-        return _default_backend(dd.read_hdf, *args, **kwargs).to_backend(
-            "cudf"
-        )
-
-
-# Define "cudf" backend entrypoint for dask-expr
-class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint):
+# Define the "cudf" backend for expr-based Dask DataFrame
+class CudfBackendEntrypoint(DataFrameBackendEntrypoint):
     """Backend-entrypoint class for Dask-Expressions
 
     This class is registered under the name "cudf" for the
-    ``dask-expr.dataframe.backends`` entrypoint in ``setup.cfg``.
+    ``dask_expr.dataframe.backends`` entrypoint in ``pyproject.toml``.
     Dask-DataFrame will use the methods defined in this class
     in place of ``dask_expr.<creation-method>`` when the
     "dataframe.backend" configuration is set to "cudf":
@@ -714,30 +617,44 @@ def read_csv(
         storage_options=None,
         **kwargs,
     ):
-        import dask_expr as dx
-        from fsspec.utils import stringify_path
+        try:
+            # TODO: Remove when cudf is pinned to dask>2024.12.0
+            import dask_expr as dx
+            from dask_expr.io.csv import ReadCSV
+            from fsspec.utils import stringify_path
+
+            if not isinstance(path, str):
+                path = stringify_path(path)
+            return dx.new_collection(
+                ReadCSV(
+                    path,
+                    dtype_backend=dtype_backend,
+                    storage_options=storage_options,
+                    kwargs=kwargs,
+                    header=header,
+                    dataframe_backend="cudf",
+                )
+            )
+        except ImportError:
+            # Requires dask>2024.12.0
+            from dask_cudf.io.csv import read_csv
 
-        if not isinstance(path, str):
-            path = stringify_path(path)
-        return dx.new_collection(
-            dx.io.csv.ReadCSV(
+            return read_csv(
                 path,
-                dtype_backend=dtype_backend,
-                storage_options=storage_options,
-                kwargs=kwargs,
+                *args,
                 header=header,
-                dataframe_backend="cudf",
+                storage_options=storage_options,
+                **kwargs,
             )
-        )
 
     @staticmethod
     def read_json(*args, **kwargs):
-        from dask_cudf._legacy.io.json import read_json as read_json_impl
+        from dask_cudf.io.json import read_json as read_json_impl
 
         return read_json_impl(*args, **kwargs)
 
     @staticmethod
     def read_orc(*args, **kwargs):
-        from dask_cudf._legacy.io.orc import read_orc as legacy_read_orc
+        from dask_cudf.io.orc import read_orc as legacy_read_orc
 
         return legacy_read_orc(*args, **kwargs)
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index 5fd217209ec..32461104ef9 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -1,56 +1,41 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 import textwrap
+import warnings
+from importlib import import_module
 
 import dask.dataframe as dd
-from dask.tokenize import tokenize
 
 import cudf
 from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 # This module provides backward compatibility for legacy import patterns.
-if dd.DASK_EXPR_ENABLED:
-    from dask_cudf._expr.collection import (
-        DataFrame,
-        Index,
-        Series,
-    )
-else:
-    from dask_cudf._legacy.core import DataFrame, Index, Series  # noqa: F401
-
+from dask_cudf._expr.collection import (
+    DataFrame,  # noqa: F401
+    Index,  # noqa: F401
+    Series,  # noqa: F401
+)
 
 concat = dd.concat
 
 
 @_dask_cudf_performance_tracking
 def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
-    from dask_cudf import QUERY_PLANNING_ON
-
     if isinstance(getattr(data, "index", None), cudf.MultiIndex):
         raise NotImplementedError(
             "dask_cudf does not support MultiIndex Dataframes."
         )
 
-    # Dask-expr doesn't support the `name` argument
-    name = {}
-    if not QUERY_PLANNING_ON:
-        name = {
-            "name": name
-            or ("from_cudf-" + tokenize(data, npartitions or chunksize))
-        }
-
     return dd.from_pandas(
         data,
         npartitions=npartitions,
         chunksize=chunksize,
         sort=sort,
-        **name,
     )
 
 
-from_cudf.__doc__ = (
-    textwrap.dedent(
-        """
+from_cudf.__doc__ = textwrap.dedent(
+    """
         Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`.
 
         This function is a thin wrapper around
@@ -58,9 +43,23 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
         arguments (described below) excepting that it operates on cuDF
         rather than pandas objects.\n
         """
-    )
-    # TODO: `dd.from_pandas.__doc__` is empty when
-    # `DASK_DATAFRAME__QUERY_PLANNING=True`
-    # since dask-expr does not provide a docstring for from_pandas.
-    + textwrap.dedent(dd.from_pandas.__doc__ or "")
-)
+) + textwrap.dedent(dd.from_pandas.__doc__)
+
+
+def _deprecated_api(old_api, new_api=None, rec=None):
+    # Return a stub for ``old_api`` that warns and forwards, or raises
+    def inner_func(*args, **kwargs):
+        if new_api:
+            # Use alternative; stacklevel=2 blames the caller's line
+            msg = f"{old_api} is now deprecated. "
+            msg += rec or f"Please use {new_api} instead."
+            warnings.warn(msg, FutureWarning, stacklevel=2)
+            new_attr = new_api.split(".")
+            module = import_module(".".join(new_attr[:-1]))
+            return getattr(module, new_attr[-1])(*args, **kwargs)
+        # No alternative - raise an error
+        raise NotImplementedError(
+            f"{old_api} is no longer supported. " + (rec or "")
+        )
+
+    return inner_func
diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py
index 9bca33e414a..a5175c9bbe7 100644
--- a/python/dask_cudf/dask_cudf/io/__init__.py
+++ b/python/dask_cudf/dask_cudf/io/__init__.py
@@ -1,6 +1,6 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
-from dask_cudf import QUERY_PLANNING_ON, _deprecated_api
+from dask_cudf.core import _deprecated_api
 
 from . import csv, json, orc, parquet, text  # noqa: F401
 
@@ -15,20 +15,13 @@
 )
 to_orc = _deprecated_api(
     "dask_cudf.io.to_orc",
-    new_api="dask_cudf._legacy.io.to_orc",
+    new_api="dask_cudf.io.orc.to_orc",
     rec="Please use the DataFrame.to_orc method instead.",
 )
 read_text = _deprecated_api(
     "dask_cudf.io.read_text", new_api="dask_cudf.read_text"
 )
-if QUERY_PLANNING_ON:
-    read_parquet = parquet.read_parquet
-else:
-    read_parquet = _deprecated_api(
-        "The legacy dask_cudf.io.read_parquet API",
-        new_api="dask_cudf.read_parquet",
-        rec="",
-    )
+read_parquet = parquet.read_parquet
 to_parquet = _deprecated_api(
     "dask_cudf.io.to_parquet",
     new_api="dask_cudf._legacy.io.parquet.to_parquet",
diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py
index b22b31a591f..e36ee04d827 100644
--- a/python/dask_cudf/dask_cudf/io/csv.py
+++ b/python/dask_cudf/dask_cudf/io/csv.py
@@ -1,8 +1,193 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
-from dask_cudf import _deprecated_api
+import os
+from glob import glob
+from warnings import warn
 
-read_csv = _deprecated_api(
-    "dask_cudf.io.csv.read_csv",
-    new_api="dask_cudf.read_csv",
-)
+from fsspec.utils import infer_compression
+
+from dask import dataframe as dd
+from dask.dataframe.io.csv import make_reader
+from dask.utils import parse_bytes
+
+import cudf
+
+
+def read_csv(path, blocksize="default", **kwargs):
+    """
+    Load one or more CSV files as a :class:`.DataFrame` collection.
+
+    :func:`cudf:cudf.read_csv` does the actual parsing; this function
+    parallelizes it as follows:
+
+    Globstrings expand to many files that are loaded together:
+
+    >>> import dask_cudf
+    >>> df = dask_cudf.read_csv("myfiles.*.csv")
+
+    A large file is split into blocks when ``blocksize`` is given:
+
+    >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB")
+
+    Remote locations (e.g. S3, HTTP, FTP) can be read as well:
+
+    >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv")
+    >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv")
+
+    Because :func:`cudf:cudf.read_csv` is used internally, many of
+    its keyword arguments are supported here with the same
+    performance guarantees. See the docstring for
+    :func:`cudf:cudf.read_csv` for more information on available
+    keyword arguments.
+
+    Parameters
+    ----------
+    path : str, path object, or file-like object
+        Either a path to a file (a str, :py:class:`pathlib.Path`, or
+        ``py._path.local.LocalPath``), URL (including HTTP, FTP, and S3
+        locations), or any object with a ``read()`` method (such as
+        builtin :py:func:`open` file handler function or
+        :py:class:`~io.StringIO`).
+    blocksize : int, str, or None, default "256 MiB"
+        The target task partition size. If ``None``, a single block
+        is used for each file.
+    **kwargs : dict
+        Passthrough keyword arguments that are sent to
+        :func:`cudf:cudf.read_csv`.
+
+    Notes
+    -----
+    If any of `skipfooter`/`skiprows`/`nrows` are passed,
+    `blocksize` will default to None.
+
+    Examples
+    --------
+    >>> import dask_cudf
+    >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"])
+    >>> ddf.compute()
+       a      b
+    0  1     hi
+    1  2  hello
+    2  3     ai
+
+    """
+    # Resolve the "default" sentinel into a concrete `blocksize`
+    if blocksize == "default":
+        # Cannot read in blocks if skipfooter,
+        # skiprows or nrows is passed.
+        whole_files_only = (
+            kwargs.get("skipfooter", 0) != 0
+            or kwargs.get("skiprows", 0) != 0
+            or kwargs.get("nrows", None) is not None
+        )
+        blocksize = None if whole_files_only else "256 MiB"
+
+    if "://" not in str(path):
+        # No URL scheme: use the local glob/byte-range reader
+        return _internal_read_csv(path=path, blocksize=blocksize, **kwargs)
+
+    # URL-style path: defer to the reader built by dask's `make_reader`
+    url_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
+    return url_reader(path, blocksize=blocksize, **kwargs)
+
+
+def _internal_read_csv(path, blocksize="256 MiB", **kwargs):
+    """Read local CSV file(s) into a collection, one partition per byte range.
+
+    Compressed input, or ``blocksize=None``, is instead read one whole file
+    per partition through ``read_csv_without_blocksize``.
+    """
+    if isinstance(blocksize, str):
+        blocksize = parse_bytes(blocksize)
+
+    if isinstance(path, list):
+        filenames = path
+    elif isinstance(path, str):
+        filenames = sorted(glob(path))
+    elif hasattr(path, "__fspath__"):
+        filenames = sorted(glob(path.__fspath__()))
+    else:
+        raise TypeError(f"Path type not understood:{type(path)}")
+
+    if not filenames:
+        # Report the requested path: `filenames` is always empty here
+        raise FileNotFoundError(f"A file in: {path} does not exist.")
+
+    compression = kwargs.get("compression", "infer")
+    if compression == "infer":
+        # Infer compression from first path by default
+        compression = infer_compression(filenames[0])
+
+    if compression and blocksize:
+        # compressed CSVs reading must read the entire file
+        kwargs.pop("byte_range", None)
+        warn(
+            "Warning %s compression does not support breaking apart files\n"
+            "Please ensure that each individual file can fit in memory and\n"
+            "use the keyword ``blocksize=None to remove this message``\n"
+            "Setting ``blocksize=(size of file)``" % compression
+        )
+        blocksize = None
+
+    if blocksize is None:
+        return read_csv_without_blocksize(path, **kwargs)
+
+    # Let dask.dataframe generate meta
+    dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV")
+    kwargs1 = kwargs.copy()
+    usecols = kwargs1.pop("usecols", None)
+    dtype = kwargs1.pop("dtype", None)
+    meta = dask_reader(filenames[0], **kwargs1)._meta
+    names = meta.columns
+    if usecols or dtype:
+        # Regenerate meta with original kwargs if
+        # `usecols` or `dtype` was specified
+        meta = dask_reader(filenames[0], **kwargs)._meta
+
+    path_list = []
+    kwargs_list = []
+    for fn in filenames:
+        size = os.path.getsize(fn)
+        for start in range(0, size, blocksize):
+            kwargs2 = kwargs.copy()
+            kwargs2["byte_range"] = (start, blocksize)  # chunk to read
+            if start != 0:
+                kwargs2["names"] = names  # no header in the middle of the file
+                kwargs2["header"] = None
+            path_list.append(fn)
+            kwargs_list.append(kwargs2)
+
+    return dd.from_map(_read_csv, path_list, kwargs_list, meta=meta)
+
+
+def _read_csv(fn, kwargs):
+    # Each task receives its keyword arguments as one dict; unpack them
+    return cudf.read_csv(fn, **kwargs)
+
+
+def read_csv_without_blocksize(path, **kwargs):
+    """Read entire CSV with optional compression (gzip/zip)
+
+    Parameters
+    ----------
+    path : str, list, or os.PathLike
+        File list, or a glob pattern that is expanded and sorted.
+    """
+    if isinstance(path, list):
+        filenames = path
+    elif isinstance(path, str):
+        filenames = sorted(glob(path))
+    elif hasattr(path, "__fspath__"):
+        filenames = sorted(glob(path.__fspath__()))
+    else:
+        raise TypeError(f"Path type not understood:{type(path)}")
+    if not filenames:
+        raise FileNotFoundError(f"A file in: {path} does not exist.")
+
+    # Metadata comes from a 5-row sample of the first file, emptied with
+    # `.iloc[:0]`; a caller `nrows` would collide with `nrows=5` below.
+    meta_kwargs = kwargs.copy()
+    meta_kwargs.pop("skipfooter", None)
+    meta_kwargs.pop("nrows", None)
+    meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0]
+    return dd.from_map(cudf.read_csv, filenames, meta=meta, **kwargs)
diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py
index 8f85ea54c0a..3022ebb2a5b 100644
--- a/python/dask_cudf/dask_cudf/io/json.py
+++ b/python/dask_cudf/dask_cudf/io/json.py
@@ -1,8 +1,209 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
-from dask_cudf import _deprecated_api
+from functools import partial
 
-read_json = _deprecated_api(
-    "dask_cudf.io.json.read_json",
-    new_api="dask_cudf.read_json",
-)
+import numpy as np
+from fsspec.core import get_compression, get_fs_token_paths
+
+import dask
+from dask.utils import parse_bytes
+
+import cudf
+from cudf.core.column import as_column
+from cudf.utils.ioutils import _is_local_filesystem
+
+from dask_cudf.backends import _default_backend
+
+
+def _read_json_partition(
+    paths,
+    fs=None,
+    include_path_column=False,
+    path_converter=None,
+    **kwargs,
+):
+    # With an fs, fetch every file's full byte range via one cat_ranges call
+    sources = (
+        paths
+        if fs is None
+        else fs.cat_ranges(
+            paths,
+            [0] * len(paths),
+            fs.sizes(paths),
+        )
+    )
+
+    if include_path_column:
+        # Add a path column (named "path" unless a str name was given).
+        # Each source is read separately so its rows get their own path.
+        if not isinstance(include_path_column, str):
+            include_path_column = "path"
+        converted_paths = (
+            paths
+            if path_converter is None
+            else [path_converter(path) for path in paths]
+        )
+        dfs = []
+        for i, source in enumerate(sources):
+            df = cudf.read_json(source, **kwargs)
+            df[include_path_column] = as_column(
+                converted_paths[i], length=len(df)
+            )
+            dfs.append(df)
+        return cudf.concat(dfs)
+    else:
+        # No path column requested: one cudf.read_json call for all sources
+        return cudf.read_json(sources, **kwargs)
+
+
+def read_json(
+    url_path,
+    engine="auto",
+    blocksize=None,
+    orient="records",
+    lines=None,
+    compression="infer",
+    aggregate_files=True,
+    **kwargs,
+):
+    """Read JSON data into a :class:`.DataFrame`.
+
+    This function wraps :func:`dask.dataframe.read_json`, and passes
+    ``engine=partial(cudf.read_json, engine="auto")`` by default.
+
+    Parameters
+    ----------
+    url_path : str, list of str
+        Location to read from. If a string, can include a glob character to
+        find a set of file names.
+        Supports protocol specifications such as ``"s3://"``.
+    engine : str or Callable, default "auto"
+        If str, this value will be used as the ``engine`` argument
+        when :func:`cudf.read_json` is used to create each partition.
+        If a :obj:`~collections.abc.Callable`, this value will be used as the
+        underlying function used to create each partition from JSON
+        data. The default value is "auto", so that
+        ``engine=partial(cudf.read_json, engine="auto")`` will be
+        passed to :func:`dask.dataframe.read_json` by default.
+    aggregate_files : bool or int
+        Whether to map multiple files to each output partition. If True,
+        the `blocksize` argument will be used to determine the number of
+        files in each partition. If any one file is larger than `blocksize`,
+        the `aggregate_files` argument will be ignored. If an integer value
+        is specified, the `blocksize` argument will be ignored, and that
+        number of files will be mapped to each partition. Default is True.
+    **kwargs :
+        Key-word arguments to pass through to :func:`dask.dataframe.read_json`.
+
+    Returns
+    -------
+    :class:`.DataFrame`
+
+    Examples
+    --------
+    Load single file
+
+    >>> from dask_cudf import read_json
+    >>> read_json('myfile.json')  # doctest: +SKIP
+
+    Load large line-delimited JSON files using partitions of approx
+    256MB size
+
+    >>> read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
+
+    Load nested JSON data
+
+    >>> read_json('myfile.json')  # doctest: +SKIP
+
+    See Also
+    --------
+    dask.dataframe.read_json
+
+    """
+
+    if lines is None:
+        lines = orient == "records"
+    if orient != "records" and lines:
+        raise ValueError(
+            'Line-delimited JSON is only available with orient="records".'
+        )
+    if blocksize and (orient != "records" or not lines):
+        raise ValueError(
+            "JSON file chunking only allowed for JSON-lines "
+            "input (orient='records', lines=True)."
+        )
+
+    inputs = []
+    if aggregate_files and blocksize or int(aggregate_files) > 1:
+        # Attempt custom read if we are mapping multiple files
+        # to each output partition. Otherwise, upstream logic
+        # is sufficient.
+
+        storage_options = kwargs.get("storage_options", {})
+        fs, _, paths = get_fs_token_paths(
+            url_path, mode="rb", storage_options=storage_options
+        )
+        if isinstance(aggregate_files, int) and aggregate_files > 1:
+            # Map a static file count to each partition
+            inputs = [
+                paths[offset : offset + aggregate_files]
+                for offset in range(0, len(paths), aggregate_files)
+            ]
+        elif aggregate_files is True and blocksize:
+            # Map files dynamically (using blocksize)
+            file_sizes = fs.sizes(paths)  # NOTE: This can be slow
+            blocksize = parse_bytes(blocksize)
+            if all(file_size <= blocksize for file_size in file_sizes):
+                counts = np.unique(
+                    np.floor(np.cumsum(file_sizes) / blocksize),
+                    return_counts=True,
+                )[1]
+                offsets = np.concatenate([[0], counts.cumsum()])
+                inputs = [
+                    paths[offsets[i] : offsets[i + 1]]
+                    for i in range(len(offsets) - 1)
+                ]
+
+    if inputs:
+        # Inputs were successfully populated.
+        # Use custom _read_json_partition function
+        # to generate each partition.
+
+        compression = get_compression(
+            url_path[0] if isinstance(url_path, list) else url_path,
+            compression,
+        )
+        # NOTE: ``engine`` and other reader kwargs are not forwarded here.
+        _kwargs = dict(
+            orient=orient,
+            lines=lines,
+            compression=compression,
+            include_path_column=kwargs.get("include_path_column", False),
+            path_converter=kwargs.get("path_converter"),
+        )
+        if not _is_local_filesystem(fs):
+            _kwargs["fs"] = fs
+        # TODO: Generate meta more efficiently
+        meta = _read_json_partition(inputs[0][:1], **_kwargs)
+        return dask.dataframe.from_map(
+            _read_json_partition,
+            inputs,
+            meta=meta,
+            **_kwargs,
+        )
+
+    # Fall back to dask.dataframe.read_json
+    return _default_backend(
+        dask.dataframe.read_json,
+        url_path,
+        engine=(
+            partial(cudf.read_json, engine=engine)
+            if isinstance(engine, str)
+            else engine
+        ),
+        blocksize=blocksize,
+        orient=orient,
+        lines=lines,
+        compression=compression,
+        **kwargs,
+    )
diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py
index 5219cdacc31..5de28751912 100644
--- a/python/dask_cudf/dask_cudf/io/orc.py
+++ b/python/dask_cudf/dask_cudf/io/orc.py
@@ -1,13 +1,195 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-from dask_cudf import _deprecated_api
-
-read_orc = _deprecated_api(
-    "dask_cudf.io.orc.read_orc",
-    new_api="dask_cudf.read_orc",
-)
-to_orc = _deprecated_api(
-    "dask_cudf.io.orc.to_orc",
-    new_api="dask_cudf._legacy.io.orc.to_orc",
-    rec="Please use the DataFrame.to_orc method instead.",
-)
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
+
+from io import BufferedWriter, IOBase
+
+from fsspec.core import get_fs_token_paths
+from fsspec.utils import stringify_path
+from pyarrow import orc as orc
+
+from dask import dataframe as dd
+from dask.dataframe.io.utils import _get_pyarrow_dtypes
+
+import cudf
+
+
+def _read_orc_stripe(source, fs, columns=None, kwargs=None):
+    """Read one stripe of one ORC file; ``source`` is ``(path, stripe)``."""
+    path, stripe = source
+    if kwargs is None:
+        kwargs = {}
+    with fs.open(path, "rb") as f:
+        df_stripe = cudf.read_orc(
+            f, stripes=[stripe], columns=columns, **kwargs
+        )
+    return df_stripe
+
+
+def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
+    """Read ORC files into a :class:`.DataFrame`.
+
+    Note that this function is mostly borrowed from upstream Dask.
+
+    Parameters
+    ----------
+    path : str or list[str]
+        Location of file(s), which can be a full URL with protocol specifier,
+        and may include glob character if a single string.
+    columns : None or list[str]
+        Columns to load. If None, loads all.
+    filters : None or list of tuple or list of lists of tuples
+        If not None, specifies a filter predicate; each file then yields
+        only the stripes returned by ``cudf.io.orc._filter_stripes``.
+        The predicate is expressed in
+        `disjunctive normal form (DNF)
+        <https://en.wikipedia.org/wiki/Disjunctive_normal_form>`__
+        like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary
+        boolean logical combinations of single column predicates. The
+        innermost tuples each describe a single column predicate. The
+        list of inner predicates is interpreted as a conjunction
+        (AND), forming a more selective and multiple column predicate.
+        Finally, the outermost list combines these filters as a
+        disjunction (OR). Predicates may also be passed as a list of
+        tuples. This form is interpreted as a single conjunction. To
+        express OR in predicates, one must use the (preferred)
+        notation of list of lists of tuples.
+    storage_options : None or dict
+        Further parameters to pass to the bytes backend.
+    **kwargs :
+        Passed through to :func:`cudf.read_orc` for every read.
+
+    Returns
+    -------
+    dask_cudf.DataFrame
+
+    See Also
+    --------
+    dask.dataframe.read_orc
+    """
+
+    storage_options = storage_options or {}
+    fs, _, paths = get_fs_token_paths(
+        path, mode="rb", storage_options=storage_options
+    )
+    schema = None
+    nstripes_per_file = []
+    for path in paths:
+        with fs.open(path, "rb") as f:
+            o = orc.ORCFile(f)
+            if schema is None:
+                schema = o.schema
+            elif schema != o.schema:
+                raise ValueError(
+                    "Incompatible schemas while parsing ORC files"
+                )
+            nstripes_per_file.append(o.nstripes)
+    schema = _get_pyarrow_dtypes(schema, categories=None)
+    if columns is not None:
+        ex = set(columns) - set(schema)
+        if ex:
+            raise ValueError(
+                f"Requested columns ({ex}) not in schema ({set(schema)})"
+            )
+    else:
+        columns = list(schema)
+
+    with fs.open(paths[0], "rb") as f:
+        meta = cudf.read_orc(
+            f,
+            stripes=[0] if nstripes_per_file[0] else None,
+            columns=columns,
+            **kwargs,
+        )
+
+    sources = []
+    for path, n in zip(paths, nstripes_per_file):
+        for stripe in (
+            range(n)
+            if filters is None
+            else cudf.io.orc._filter_stripes(filters, path)
+        ):
+            sources.append((path, stripe))
+
+    return dd.from_map(
+        _read_orc_stripe,
+        sources,
+        args=[fs],
+        columns=columns,
+        kwargs=kwargs,
+        meta=meta,
+    )
+
+
+def write_orc_partition(df, path, fs, filename, compression="snappy"):
+    full_path = fs.sep.join([path, filename])  # <path><sep><filename>
+    with fs.open(full_path, mode="wb") as out_file:
+        if not isinstance(out_file, IOBase):  # wrap non-IOBase handles
+            out_file = BufferedWriter(out_file)
+        cudf.io.to_orc(df, out_file, compression=compression)
+    return full_path  # location of the file just written
+
+
+def to_orc(
+    df,
+    path,
+    write_index=True,
+    storage_options=None,
+    compression="snappy",
+    compute=True,
+    **kwargs,
+):
+    """Write a :class:`.DataFrame` to ORC file(s) (one file per partition).
+
+    Parameters
+    ----------
+    df : DataFrame
+    path : str or pathlib.Path
+        Destination directory for data.  Prepend with protocol like ``s3://``
+        or ``hdfs://`` for remote data.
+    write_index : boolean, optional
+        Whether or not to write the index. Defaults to True.
+    storage_options : None or dict
+        Further parameters to pass to the bytes backend.
+    compression : string or dict, optional
+        Forwarded to ``cudf.io.to_orc``. Defaults to ``"snappy"``.
+    compute : bool, optional
+        If True (default), compute immediately; otherwise return a
+        :class:`~dask.delayed.Delayed` object for future computation.
+    **kwargs :
+        Accepted but not used by this function.
+    """
+
+    from dask import compute as dask_compute, delayed
+
+    # TODO: Use upstream dask implementation once available
+    #       (see: Dask Issue#5596)
+
+    if hasattr(path, "name"):
+        path = stringify_path(path)
+    fs, _, _ = get_fs_token_paths(
+        path, mode="wb", storage_options=storage_options
+    )
+    # Trim any protocol information from the path before forwarding
+    path = fs._strip_protocol(path)
+
+    if write_index:
+        df = df.reset_index()
+    else:
+        # Not writing index - might as well drop it
+        df = df.reset_index(drop=True)
+
+    fs.mkdirs(path, exist_ok=True)
+
+    # One output file per partition, named part.<i>.orc
+    filenames = ["part.%i.orc" % i for i in range(df.npartitions)]
+
+    # write parts
+    dwrite = delayed(write_orc_partition)
+    parts = [
+        dwrite(d, path, fs, filename, compression=compression)
+        for d, filename in zip(df.to_delayed(), filenames)
+    ]
+
+    if compute:
+        return dask_compute(*parts)
+
+    return delayed(list)(parts)
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index ba6209c4820..a953dce787d 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -37,10 +37,9 @@ def TaskList(*x):
 
 import cudf
 
-from dask_cudf import QUERY_PLANNING_ON, _deprecated_api
-
 # Dask-expr imports CudfEngine from this module
 from dask_cudf._legacy.io.parquet import CudfEngine
+from dask_cudf.core import _deprecated_api
 
 if TYPE_CHECKING:
     from collections.abc import MutableMapping
@@ -832,15 +831,8 @@ def read_parquet_expr(
     )
 
 
-if QUERY_PLANNING_ON:
-    read_parquet = read_parquet_expr
-    read_parquet.__doc__ = read_parquet_expr.__doc__
-else:
-    read_parquet = _deprecated_api(
-        "The legacy dask_cudf.io.parquet.read_parquet API",
-        new_api="dask_cudf.read_parquet",
-        rec="",
-    )
+read_parquet = read_parquet_expr
+read_parquet.__doc__ = read_parquet_expr.__doc__
 to_parquet = _deprecated_api(
     "dask_cudf.io.parquet.to_parquet",
     new_api="dask_cudf._legacy.io.parquet.to_parquet",
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py
index a0acb86f5a9..ddfd1c1adac 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py
@@ -185,11 +185,6 @@ def test_read_csv_blocksize_none(tmp_path, compression, size):
     df2 = dask_cudf.read_csv(path, blocksize=None, dtype=typ)
     dd.assert_eq(df, df2)
 
-    # Test chunksize deprecation
-    with pytest.warns(FutureWarning, match="deprecated"):
-        df3 = dask_cudf.read_csv(path, chunksize=None, dtype=typ)
-    dd.assert_eq(df, df3)
-
 
 @pytest.mark.parametrize("dtype", [{"b": str, "c": int}, None])
 def test_csv_reader_usecols(tmp_path, dtype):
@@ -275,7 +270,3 @@ def test_deprecated_api_paths(tmp_path):
     with pytest.warns(match="dask_cudf.io.read_csv is now deprecated"):
         df2 = dask_cudf.io.read_csv(csv_path)
     dd.assert_eq(df, df2, check_divisions=False)
-
-    with pytest.warns(match="dask_cudf.io.csv.read_csv is now deprecated"):
-        df2 = dask_cudf.io.csv.read_csv(csv_path)
-    dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
index f5509cf91c3..48eca13e16f 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_json.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import math
 import os
@@ -11,10 +11,6 @@
 from dask.utils import tmpfile
 
 import dask_cudf
-from dask_cudf.tests.utils import skip_dask_expr
-
-# No dask-expr support for dask<2024.4.0
-pytestmark = skip_dask_expr(lt_version="2024.4.0")
 
 
 def test_read_json_backend_dispatch(tmp_path):
@@ -137,7 +133,3 @@ def test_deprecated_api_paths(tmp_path):
     with pytest.warns(match="dask_cudf.io.read_json is now deprecated"):
         df2 = dask_cudf.io.read_json(path)
     dd.assert_eq(df, df2, check_divisions=False)
-
-    with pytest.warns(match="dask_cudf.io.json.read_json is now deprecated"):
-        df2 = dask_cudf.io.json.read_json(path)
-    dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
index b6064d851ca..4aac463420b 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 
 import glob
 import os
@@ -12,10 +12,6 @@
 import cudf
 
 import dask_cudf
-from dask_cudf.tests.utils import skip_dask_expr
-
-# No dask-expr support for dask<2024.4.0
-pytestmark = skip_dask_expr(lt_version="2024.4.0")
 
 cur_dir = os.path.dirname(__file__)
 sample_orc = os.path.join(cur_dir, "data/orc/sample.orc")
@@ -159,7 +155,3 @@ def test_deprecated_api_paths(tmpdir):
     with pytest.warns(match="dask_cudf.io.read_orc is now deprecated"):
         df2 = dask_cudf.io.read_orc(paths)
     dd.assert_eq(df, df2, check_divisions=False)
-
-    with pytest.warns(match="dask_cudf.io.orc.read_orc is now deprecated"):
-        df2 = dask_cudf.io.orc.read_orc(paths)
-    dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 6efe6c4f388..9f7031f4d2a 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import glob
 import math
@@ -16,11 +16,6 @@
 
 import dask_cudf
 from dask_cudf._legacy.io.parquet import create_metadata_file
-from dask_cudf.tests.utils import (
-    require_dask_expr,
-    skip_dask_expr,
-    xfail_dask_expr,
-)
 
 # Check if create_metadata_file is supported by
 # the current dask.dataframe version
@@ -450,7 +445,6 @@ def test_create_metadata_file(tmpdir, partition_on):
     dd.assert_eq(ddf1, ddf2)
 
 
-@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 @need_create_meta
 def test_create_metadata_file_inconsistent_schema(tmpdir):
     # NOTE: This test demonstrates that the CudfEngine
@@ -531,19 +525,6 @@ def test_cudf_list_struct_write(tmpdir):
     dd.assert_eq(df, new_ddf)
 
 
-@skip_dask_expr("Not necessary in dask-expr")
-def test_check_file_size(tmpdir):
-    # Test simple file-size check to help warn users
-    # of upstream change to `split_row_groups` default
-    fn = str(tmpdir.join("test.parquet"))
-    cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn)
-    with pytest.warns(match="large parquet file"):
-        # Need to use `dask_cudf._legacy.io` path
-        # TODO: Remove outdated `check_file_size` functionality
-        dask_cudf._legacy.io.read_parquet(fn, check_file_size=1).compute()
-
-
-@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0")
 def test_null_partition(tmpdir):
     import pyarrow as pa
     from pyarrow.dataset import HivePartitioning
@@ -626,7 +607,6 @@ def test_timezone_column(tmpdir):
     dd.assert_eq(got, expect)
 
 
-@require_dask_expr()
 @pytest.mark.skipif(
     not dask_cudf.backends.PYARROW_GE_15,
     reason="Requires pyarrow 15",
@@ -677,17 +657,8 @@ def test_deprecated_api_paths(tmpdir):
     with pytest.warns(match="dask_cudf.io.to_parquet is now deprecated"):
         dask_cudf.io.to_parquet(df, tmpdir)
 
-    if dask_cudf.QUERY_PLANNING_ON:
-        df2 = dask_cudf.io.read_parquet(tmpdir)
-        dd.assert_eq(df, df2, check_divisions=False)
-
-        df2 = dask_cudf.io.parquet.read_parquet(tmpdir)
-        dd.assert_eq(df, df2, check_divisions=False)
-    else:
-        with pytest.warns(match="legacy dask_cudf.io.read_parquet"):
-            df2 = dask_cudf.io.read_parquet(tmpdir)
-            dd.assert_eq(df, df2, check_divisions=False)
+    df2 = dask_cudf.io.read_parquet(tmpdir)
+    dd.assert_eq(df, df2, check_divisions=False)
 
-        with pytest.warns(match="legacy dask_cudf.io.parquet.read_parquet"):
-            df2 = dask_cudf.io.parquet.read_parquet(tmpdir)
-            dd.assert_eq(df, df2, check_divisions=False)
+    df2 = dask_cudf.io.parquet.read_parquet(tmpdir)
+    dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index 90907f6fb99..7c53b89a883 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 
 import os
 import socket
@@ -14,7 +14,6 @@
 import cudf
 
 import dask_cudf
-from dask_cudf.tests.utils import QUERY_PLANNING_ON
 
 moto = pytest.importorskip("moto", minversion="3.1.6")
 boto3 = pytest.importorskip("boto3")
@@ -136,7 +135,7 @@ def test_read_parquet_open_file_options_raises():
         pytest.param(
             "arrow",
             marks=pytest.mark.skipif(
-                not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15,
+                not dask_cudf.backends.PYARROW_GE_15,
                 reason="Not supported",
             ),
         ),
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py
index e35b6411a9d..f4d59334e03 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_text.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 
 import os
 
@@ -9,10 +9,6 @@
 import cudf
 
 import dask_cudf
-from dask_cudf.tests.utils import skip_dask_expr
-
-# No dask-expr support for dask<2024.4.0
-pytestmark = skip_dask_expr(lt_version="2024.4.0")
 
 cur_dir = os.path.dirname(__file__)
 text_file = os.path.join(cur_dir, "data/text/sample.pgn")
@@ -42,7 +38,3 @@ def test_deprecated_api_paths():
     with pytest.warns(match="dask_cudf.io.read_text is now deprecated"):
         df2 = dask_cudf.io.read_text(text_file, delimiter=".")
     dd.assert_eq(df, df2, check_divisions=False)
-
-    with pytest.warns(match="dask_cudf.io.text.read_text is now deprecated"):
-        df2 = dask_cudf.io.text.read_text(text_file, delimiter=".")
-    dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py
index 1caf4e81d8e..eb1d007cc16 100644
--- a/python/dask_cudf/dask_cudf/io/text.py
+++ b/python/dask_cudf/dask_cudf/io/text.py
@@ -1,8 +1,56 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 
-from dask_cudf import _deprecated_api
+import os
+from glob import glob
 
-read_text = _deprecated_api(
-    "dask_cudf.io.text.read_text",
-    new_api="dask_cudf.read_text",
-)
+import dask.dataframe as dd
+from dask.utils import parse_bytes
+
+import cudf
+
+
+def _read_text(source, **kwargs):
+    # Unpack a (filename, byte_range) source and hand it to cudf.read_text
+    fn, byte_range = source
+    return cudf.read_text(fn, byte_range=byte_range, **kwargs)
+
+
+def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs):
+    """Read text files, mapping each ``chunksize``-byte range to a partition.
+
+    Pass ``chunksize=None`` to read each file once with ``byte_range``.
+    """
+    if isinstance(chunksize, str):
+        chunksize = parse_bytes(chunksize)
+
+    if isinstance(path, list):
+        filenames = path
+    elif isinstance(path, str):
+        filenames = sorted(glob(path))
+    elif hasattr(path, "__fspath__"):
+        filenames = sorted(glob(path.__fspath__()))
+    else:
+        raise TypeError(f"Path type not understood:{type(path)}")
+
+    if not filenames:
+        raise FileNotFoundError(f"A file in: {path} does not exist.")
+
+    if chunksize and byte_range:
+        raise ValueError("Cannot specify both chunksize and byte_range.")
+
+    if chunksize:
+        sources = []
+        for fn in filenames:
+            size = os.path.getsize(fn)
+            for start in range(0, size, chunksize):
+                # (offset, length) of the chunk this partition reads
+                sources.append((fn, (start, chunksize)))
+    else:
+        sources = [(fn, byte_range) for fn in filenames]
+
+    return dd.from_map(
+        _read_text,
+        sources,
+        meta=cudf.Series([], dtype="O"),
+        **kwargs,
+    )
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index 3fbb2aacd2c..c6b01a648eb 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
@@ -13,7 +13,6 @@
 from cudf.testing._utils import does_not_raise
 
 import dask_cudf
-from dask_cudf.tests.utils import xfail_dask_expr
 
 #############################################################################
 #                        Datetime Accessor                                  #
@@ -112,7 +111,6 @@ def test_categorical_accessor_initialization2(data):
         dsr.cat
 
 
-@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 @pytest.mark.parametrize("data", [data_cat_1()])
 def test_categorical_basic(data):
     cat = data.copy()
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index cda7e2d134d..31957a106ff 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 
 import random
 
@@ -9,18 +9,12 @@
 
 import dask
 from dask import dataframe as dd
-from dask.dataframe.core import make_meta as dask_make_meta, meta_nonempty
+from dask.dataframe.dispatch import make_meta as dask_make_meta, meta_nonempty
 from dask.utils import M
 
 import cudf
 
 import dask_cudf
-from dask_cudf.tests.utils import (
-    QUERY_PLANNING_ON,
-    require_dask_expr,
-    skip_dask_expr,
-    xfail_dask_expr,
-)
 
 rng = np.random.default_rng(seed=0)
 
@@ -299,37 +293,6 @@ def test_set_index_sorted():
             gddf1.set_index("val", sorted=True)
 
 
-@pytest.mark.parametrize("nelem", [10, 200, 1333])
-@pytest.mark.parametrize("index", [None, "myindex"])
-def test_rearrange_by_divisions(nelem, index):
-    with dask.config.set(scheduler="single-threaded"):
-        rng = np.random.default_rng(seed=0)
-        df = pd.DataFrame(
-            {
-                "x": rng.integers(0, 20, size=nelem),
-                "y": rng.normal(size=nelem),
-                "z": rng.choice(["dog", "cat", "bird"], nelem),
-            }
-        )
-        df["z"] = df["z"].astype("category")
-
-        ddf1 = dd.from_pandas(df, npartitions=4)
-        gdf1 = dask_cudf.from_cudf(
-            cudf.DataFrame.from_pandas(df), npartitions=4
-        )
-        ddf1.index.name = index
-        gdf1.index.name = index
-        divisions = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20)
-
-        expect = dd.shuffle.rearrange_by_divisions(
-            ddf1, "x", divisions=divisions, shuffle_method="tasks"
-        )
-        result = dd.shuffle.rearrange_by_divisions(
-            gdf1, "x", divisions=divisions, shuffle_method="tasks"
-        )
-        dd.assert_eq(expect, result)
-
-
 def test_assign():
     rng = np.random.default_rng(seed=0)
     df = pd.DataFrame(
@@ -393,44 +356,6 @@ def test_setitem_scalar_datetime():
     np.testing.assert_array_equal(got["z"], df["z"])
 
 
-@skip_dask_expr("Not relevant for dask-expr")
-@pytest.mark.parametrize(
-    "func",
-    [
-        lambda: pd.DataFrame(
-            {"A": rng.random(10), "B": rng.random(10)},
-            index=list("abcdefghij"),
-        ),
-        lambda: pd.DataFrame(
-            {
-                "A": rng.random(10),
-                "B": list("a" * 10),
-                "C": pd.Series(
-                    [str(20090101 + i) for i in range(10)],
-                    dtype="datetime64[ns]",
-                ),
-            },
-            index=list("abcdefghij"),
-        ),
-        lambda: pd.Series(list("abcdefghijklmnop")),
-        lambda: pd.Series(
-            rng.random(10),
-            index=pd.Index(
-                [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]"
-            ),
-        ),
-    ],
-)
-def test_repr(func):
-    pdf = func()
-    gdf = cudf.from_pandas(pdf)
-    gddf = dd.from_pandas(gdf, npartitions=3, sort=False)
-
-    assert repr(gddf)
-    if hasattr(pdf, "_repr_html_"):
-        assert gddf._repr_html_()
-
-
 @pytest.mark.skip(reason="datetime indexes not fully supported in cudf")
 @pytest.mark.parametrize("start", ["1d", "5d", "1w", "12h"])
 @pytest.mark.parametrize("stop", ["1d", "3d", "8h"])
@@ -657,20 +582,20 @@ def test_hash_object_dispatch(index):
     )
 
     # DataFrame
-    result = dd.core.hash_object_dispatch(obj, index=index)
+    result = dd.dispatch.hash_object_dispatch(obj, index=index)
     expected = dask_cudf.backends.hash_object_cudf(obj, index=index)
     assert isinstance(result, cudf.Series)
     dd.assert_eq(result, expected)
 
     # Series
-    result = dd.core.hash_object_dispatch(obj["x"], index=index)
+    result = dd.dispatch.hash_object_dispatch(obj["x"], index=index)
     expected = dask_cudf.backends.hash_object_cudf(obj["x"], index=index)
     assert isinstance(result, cudf.Series)
     dd.assert_eq(result, expected)
 
     # DataFrame with MultiIndex
     obj_multi = obj.set_index(["x", "z"], drop=True)
-    result = dd.core.hash_object_dispatch(obj_multi, index=index)
+    result = dd.dispatch.hash_object_dispatch(obj_multi, index=index)
     expected = dask_cudf.backends.hash_object_cudf(obj_multi, index=index)
     assert isinstance(result, cudf.Series)
     dd.assert_eq(result, expected)
@@ -784,7 +709,6 @@ def test_dataframe_set_index():
         assert_eq(ddf.compute(), pddf.compute())
 
 
-@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 def test_series_describe():
     random.seed(0)
     sr = cudf.datasets.randomdata(20)["x"]
@@ -800,7 +724,6 @@ def test_series_describe():
     )
 
 
-@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 def test_dataframe_describe():
     random.seed(0)
     df = cudf.datasets.randomdata(20)
@@ -814,7 +737,6 @@ def test_dataframe_describe():
     )
 
 
-@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 def test_zero_std_describe():
     num = 84886781
     df = cudf.DataFrame(
@@ -864,7 +786,7 @@ def test_merging_categorical_columns():
 
     ddf_1 = dask_cudf.from_cudf(df_1, npartitions=2)
 
-    ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"])
+    ddf_1 = ddf_1.categorize(columns=["cat_col"])
 
     df_2 = cudf.DataFrame(
         {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]}
@@ -872,7 +794,7 @@ def test_merging_categorical_columns():
 
     ddf_2 = dask_cudf.from_cudf(df_2, npartitions=2)
 
-    ddf_2 = dd.categorical.categorize(ddf_2, columns=["cat_col"])
+    ddf_2 = ddf_2.categorize(columns=["cat_col"])
 
     expected = cudf.DataFrame(
         {
@@ -932,14 +854,9 @@ def func(x):
 
     result = ds.map_partitions(func, meta=s.values)
 
-    if QUERY_PLANNING_ON:
-        # Check Array and round-tripped DataFrame
-        dask.array.assert_eq(result, func(s))
-        dd.assert_eq(result.to_dask_dataframe(), s, check_index=False)
-    else:
-        # Legacy version still carries numpy metadata
-        # See: https://github.com/dask/dask/issues/11017
-        dask.array.assert_eq(result.compute(), func(s))
+    # Check Array and round-tripped DataFrame
+    dask.array.assert_eq(result, func(s))
+    dd.assert_eq(result.to_dask_dataframe(), s, check_index=False)
 
 
 def test_implicit_array_conversion_cupy_sparse():
@@ -981,7 +898,6 @@ def test_series_isin_error():
         ddf.isin([1, 5, "a"]).compute()
 
 
-@require_dask_expr()
 def test_to_backend_simplify():
     # Check that column projection is not blocked by to_backend
     with dask.config.set({"dataframe.backend": "pandas"}):
@@ -1019,3 +935,29 @@ def test_rename_axis_after_join():
     result = ddf1.join(ddf2, how="outer")
     expected = df1.join(df2, how="outer")
     dd.assert_eq(result, expected, check_index=False)
+
+
+def test_clip_dataframe():
+    df = cudf.DataFrame(
+        {
+            "id": ["a", "b", "c", "d"],
+            "score": [-1, 1, 4, 6],
+        }
+    )
+    expect = df.clip(lower=["b", 1], upper=["d", 5], axis=1)
+    got = dd.from_pandas(df, npartitions=2).clip(
+        lower=["b", 1], upper=["d", 5], axis=1
+    )
+    dd.assert_eq(expect, got)
+
+
+def test_clip_series():
+    ser = cudf.Series([-0.5, 0.5, 4.5, 5.5])
+    expect = ser.clip(lower=0, upper=5).round().astype(int)
+    got = (
+        dd.from_pandas(ser, npartitions=2)
+        .clip(lower=0, upper=5)
+        .round()
+        .astype(int)
+    )
+    dd.assert_eq(expect, got)
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index d03180852eb..c28b7e49207 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -4,7 +4,7 @@
 import pytest
 
 import dask
-from dask import dataframe as dd
+from dask import array as da, dataframe as dd
 from dask.distributed import Client
 from distributed.utils_test import cleanup, loop, loop_in_thread  # noqa: F401
 
@@ -121,3 +121,17 @@ def test_unique():
                 ddf.x.unique().compute(),
                 check_index=False,
             )
+
+
+def test_serialization_of_numpy_types():
+    # Dask uses numpy integers as column names, which can break cudf serialization
+    with dask_cuda.LocalCUDACluster(n_workers=1) as cluster:
+        with Client(cluster):
+            with dask.config.set(
+                {"dataframe.backend": "cudf", "array.backend": "cupy"}
+            ):
+                rng = da.random.default_rng()
+                X_arr = rng.random((100, 10), chunks=(50, 10))
+                X = dd.from_dask_array(X_arr)
+                X = X[X.columns[0]]
+                X.compute()
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index 9bd3b506db0..11ca0c6a783 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
@@ -13,12 +13,7 @@
 from cudf.testing._utils import expect_warning_if
 
 import dask_cudf
-from dask_cudf._legacy.groupby import OPTIMIZED_AGGS, _aggs_optimized
-from dask_cudf.tests.utils import (
-    QUERY_PLANNING_ON,
-    require_dask_expr,
-    xfail_dask_expr,
-)
+from dask_cudf._expr.groupby import OPTIMIZED_AGGS, _aggs_optimized
 
 
 def assert_cudf_groupby_layers(ddf):
@@ -78,18 +73,12 @@ def test_groupby_basic(series, aggregation, pdf):
         expect = getattr(gdf_grouped, aggregation)()
         actual = getattr(ddf_grouped, aggregation)()
 
-    if not QUERY_PLANNING_ON:
-        assert_cudf_groupby_layers(actual)
-
     dd.assert_eq(expect, actual, check_dtype=check_dtype)
 
     if not series:
         expect = gdf_grouped.agg({"x": aggregation})
         actual = ddf_grouped.agg({"x": aggregation})
 
-        if not QUERY_PLANNING_ON:
-            assert_cudf_groupby_layers(actual)
-
         dd.assert_eq(expect, actual, check_dtype=check_dtype)
 
 
@@ -134,13 +123,6 @@ def test_groupby_agg(func, aggregation, pdf):
 
     check_dtype = aggregation != "count"
 
-    if not QUERY_PLANNING_ON:
-        assert_cudf_groupby_layers(actual)
-
-        # groupby.agg should add an explicit getitem layer
-        # to improve/enable column projection
-        assert hlg_layer(actual.dask, "getitem")
-
     dd.assert_eq(expect, actual, check_names=False, check_dtype=check_dtype)
 
 
@@ -556,20 +538,13 @@ def test_groupby_categorical_key():
         True,
         pytest.param(
             False,
-            marks=xfail_dask_expr("as_index not supported in dask-expr"),
-        ),
-    ],
-)
-@pytest.mark.parametrize(
-    "fused",
-    [
-        True,
-        pytest.param(
-            False,
-            marks=require_dask_expr("Not supported by legacy API"),
+            marks=pytest.mark.xfail(
+                reason="as_index not supported in dask-expr"
+            ),
         ),
     ],
 )
+@pytest.mark.parametrize("fused", [True, False])
 @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2])
 @pytest.mark.parametrize("split_every", [False, 4])
 @pytest.mark.parametrize("npartitions", [1, 10])
@@ -590,19 +565,16 @@ def test_groupby_agg_params(
         "c": ["mean", "std", "var"],
     }
 
-    fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {}
+    fused_kwarg = {"fused": fused}
     split_kwargs = {"split_every": split_every, "split_out": split_out}
     if split_out == "use_dask_default":
         split_kwargs.pop("split_out")
 
     # Avoid using as_index when query-planning is enabled
-    if QUERY_PLANNING_ON:
-        with pytest.warns(FutureWarning, match="argument is now deprecated"):
-            # Should warn when `as_index` is used
-            ddf.groupby(["name", "a"], sort=False, as_index=as_index)
-        maybe_as_index = {"as_index": as_index} if as_index is False else {}
-    else:
-        maybe_as_index = {"as_index": as_index}
+    with pytest.warns(FutureWarning, match="argument is now deprecated"):
+        # Should warn when `as_index` is used
+        ddf.groupby(["name", "a"], sort=False, as_index=as_index)
+    maybe_as_index = {"as_index": as_index} if as_index is False else {}
 
     # Check `sort=True` behavior
     if split_out == 1:
@@ -671,7 +643,6 @@ def test_groupby_agg_params(
     dd.assert_eq(gf, pf)
 
 
-@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 @pytest.mark.parametrize(
     "aggregations", [(sum, "sum"), (max, "max"), (min, "min")]
 )
@@ -711,7 +682,6 @@ def test_is_supported(arg, supported):
     assert _aggs_optimized(arg, OPTIMIZED_AGGS) is supported
 
 
-@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 def test_groupby_unique_lists():
     df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]})
     gdf = cudf.from_pandas(df)
@@ -758,7 +728,7 @@ def test_groupby_first_last(data, agg):
     )
 
 
-@xfail_dask_expr("Co-alignment check fails in dask-expr")
+@pytest.mark.xfail(reason="Co-alignment check fails in dask-expr")
 def test_groupby_with_list_of_series():
     df = cudf.DataFrame({"a": [1, 2, 3, 4, 5]})
     gdf = dask_cudf.from_cudf(df, npartitions=2)
@@ -773,7 +743,6 @@ def test_groupby_with_list_of_series():
     )
 
 
-@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 @pytest.mark.parametrize(
     "func",
     [
@@ -833,7 +802,7 @@ def test_groupby_all_columns(func):
     expect = func(ddf)
     actual = func(gddf)
 
-    dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON)
+    dd.assert_eq(expect, actual, check_names=False)
 
 
 def test_groupby_shuffle():
@@ -870,15 +839,3 @@ def test_groupby_shuffle():
     # NOTE: `shuffle_method=True` should be default
     got = gddf.groupby("a", sort=False).agg(spec, split_out=2)
     dd.assert_eq(expect, got.compute().sort_index())
-
-    if not QUERY_PLANNING_ON:
-        # Sorted aggregation fails with split_out>1 when shuffle is False
-        # (sort=True, split_out=2, shuffle_method=False)
-        with pytest.raises(ValueError):
-            gddf.groupby("a", sort=True).agg(
-                spec, shuffle_method=False, split_out=2
-            )
-
-        # Check shuffle kwarg deprecation
-        with pytest.warns(match="'shuffle' keyword is deprecated"):
-            gddf.groupby("a", sort=True).agg(spec, shuffle=False)
diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py
index 0b7c7855e07..2d05345bc4a 100644
--- a/python/dask_cudf/dask_cudf/tests/test_onehot.py
+++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import pandas as pd
 import pytest
@@ -8,12 +8,6 @@
 import cudf
 
 import dask_cudf
-from dask_cudf.tests.utils import xfail_dask_expr
-
-# No dask-expr support
-pytestmark = xfail_dask_expr(
-    "Newer dask version needed", lt_version="2024.5.0"
-)
 
 
 def test_get_dummies_cat():
diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py
index 02c815427f3..68d6e72660e 100644
--- a/python/dask_cudf/dask_cudf/tests/test_sort.py
+++ b/python/dask_cudf/dask_cudf/tests/test_sort.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import cupy as cp
 import numpy as np
@@ -10,7 +10,6 @@
 import cudf
 
 import dask_cudf
-from dask_cudf.tests.utils import xfail_dask_expr
 
 
 @pytest.mark.parametrize("ascending", [True, False])
@@ -67,7 +66,6 @@ def test_sort_repartition():
     dd.assert_eq(len(new_ddf), len(ddf))
 
 
-@xfail_dask_expr("missing null support", lt_version="2024.5.1")
 @pytest.mark.parametrize("na_position", ["first", "last"])
 @pytest.mark.parametrize("ascending", [True, False])
 @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]])
diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py
index b44b3f939e7..ef6765f39d1 100644
--- a/python/dask_cudf/dask_cudf/tests/utils.py
+++ b/python/dask_cudf/dask_cudf/tests/utils.py
@@ -1,22 +1,12 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
-import pytest
-from packaging.version import Version
 
-import dask
 import dask.dataframe as dd
 
 import cudf
 
-from dask_cudf import QUERY_PLANNING_ON
-
-if QUERY_PLANNING_ON:
-    DASK_VERSION = Version(dask.__version__)
-else:
-    DASK_VERSION = None
-
 
 def _make_random_frame(nelem, npartitions=2, include_na=False):
     rng = np.random.default_rng(seed=0)
@@ -30,26 +20,3 @@ def _make_random_frame(nelem, npartitions=2, include_na=False):
     gdf = cudf.DataFrame.from_pandas(df)
     dgf = dd.from_pandas(gdf, npartitions=npartitions)
     return df, dgf
-
-
-_default_reason = "Not compatible with dask-expr"
-
-
-def skip_dask_expr(reason=_default_reason, lt_version=None):
-    if lt_version is not None:
-        skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version)
-    else:
-        skip = QUERY_PLANNING_ON
-    return pytest.mark.skipif(skip, reason=reason)
-
-
-def xfail_dask_expr(reason=_default_reason, lt_version=None):
-    if lt_version is not None:
-        xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version)
-    else:
-        xfail = QUERY_PLANNING_ON
-    return pytest.mark.xfail(xfail, reason=reason)
-
-
-def require_dask_expr(reason="requires dask-expr"):
-    return pytest.mark.skipif(not QUERY_PLANNING_ON, reason=reason)
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 33ba8fe083f..5b8b98c2b55 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 
 [build-system]
 build-backend = "rapids_build_backend.build"
@@ -24,7 +24,7 @@ dependencies = [
     "fsspec>=0.6.0",
     "numpy>=1.23,<3.0a0",
     "pandas>=2.0,<2.2.4dev0",
-    "pynvml>=11.4.1,<12.0.0a0",
+    "pynvml>=12.0.0,<13.0.0a0",
     "rapids-dask-dependency==25.2.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -39,15 +39,15 @@ classifiers = [
 ]
 
 [project.entry-points."dask.dataframe.backends"]
-cudf = "dask_cudf.backends:CudfBackendEntrypoint"
+cudf = "dask_cudf.backends:LegacyCudfBackendEntrypoint"
 
 [project.entry-points."dask_expr.dataframe.backends"]
-cudf = "dask_cudf.backends:CudfDXBackendEntrypoint"
+cudf = "dask_cudf.backends:CudfBackendEntrypoint"
 
 [project.optional-dependencies]
 test = [
     "dask-cuda==25.2.*,>=0.0.0a0",
-    "numba-cuda>=0.0.13,<0.0.18",
+    "numba-cuda>=0.2.0,<0.3.0",
     "pytest-cov",
     "pytest-xdist",
     "pytest<8",
@@ -96,16 +96,11 @@ empty_parameter_set_mark = "fail_at_collect"
 filterwarnings = [
     "error::FutureWarning",
     "error::DeprecationWarning",
-    # https://github.com/rapidsai/build-planning/issues/116
-    "ignore:.*cuda..* module is deprecated.*:DeprecationWarning",
     # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow()
     "ignore:.*datetime.*utcnow.*scheduled for removal:DeprecationWarning:botocore",
     "ignore:create_block_manager_from_blocks is deprecated and will be removed in a future version. Use public APIs instead.:DeprecationWarning",
     # https://github.com/dask/partd/blob/main/partd/pandas.py#L198
     "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning",
     "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask",
-    # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437
-    # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False`
-    "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning",
 ]
 xfail_strict = true
diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt
index 5f9a04d3cee..259492b98d1 100644
--- a/python/libcudf/CMakeLists.txt
+++ b/python/libcudf/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -34,9 +34,6 @@ endif()
 
 unset(cudf_FOUND)
 
-# Find Python early so that later commands can use it
-find_package(Python 3.10 REQUIRED COMPONENTS Interpreter)
-
 set(BUILD_TESTS OFF)
 set(BUILD_BENCHMARKS OFF)
 set(CUDF_BUILD_TESTUTIL OFF)
diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd
index 2d070ddda69..fbd478f963f 100644
--- a/python/pylibcudf/pylibcudf/hashing.pxd
+++ b/python/pylibcudf/pylibcudf/hashing.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint32_t, uint64_t
 
@@ -16,6 +16,10 @@ cpdef Table murmurhash3_x64_128(
     uint64_t seed=*
 )
 
+cpdef Column xxhash_32(
+    Table input,
+    uint32_t seed=*
+)
 
 cpdef Column xxhash_64(
     Table input,
diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi
index a849f5d0729..d535d842a18 100644
--- a/python/pylibcudf/pylibcudf/hashing.pyi
+++ b/python/pylibcudf/pylibcudf/hashing.pyi
@@ -9,6 +9,7 @@ LIBCUDF_DEFAULT_HASH_SEED: Final[int]
 
 def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ...
 def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ...
+def xxhash_32(input: Table, seed: int = ...) -> Column: ...
 def xxhash_64(input: Table, seed: int = ...) -> Column: ...
 def md5(input: Table) -> Column: ...
 def sha1(input: Table) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx
index 548cffc0ce8..1f093b20c6b 100644
--- a/python/pylibcudf/pylibcudf/hashing.pyx
+++ b/python/pylibcudf/pylibcudf/hashing.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -13,6 +13,7 @@ from pylibcudf.libcudf.hash cimport (
     sha256 as cpp_sha256,
     sha384 as cpp_sha384,
     sha512 as cpp_sha512,
+    xxhash_32 as cpp_xxhash_32,
     xxhash_64 as cpp_xxhash_64,
 )
 from pylibcudf.libcudf.table.table cimport table
@@ -30,6 +31,7 @@ __all__ = [
     "sha256",
     "sha384",
     "sha512",
+    "xxhash_32",
     "xxhash_64",
 ]
 
@@ -95,6 +97,37 @@ cpdef Table murmurhash3_x64_128(
     return Table.from_libcudf(move(c_result))
 
 
+cpdef Column xxhash_32(
+    Table input,
+    uint32_t seed=DEFAULT_HASH_SEED
+):
+    """Computes the xxHash 32-bit hash value of each row in the given table.
+
+    For details, see :cpp:func:`xxhash_32`.
+
+    Parameters
+    ----------
+    input : Table
+        The table of columns to hash
+    seed : uint32_t
+        Optional seed value to use for the hash function
+
+    Returns
+    -------
+    pylibcudf.Column
+        A column where each row is the hash of a row from the input
+    """
+
+    cdef unique_ptr[column] c_result
+    with  nogil:
+        c_result = cpp_xxhash_32(
+            input.view(),
+            seed
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
 cpdef Column xxhash_64(
     Table input,
     uint64_t seed=DEFAULT_HASH_SEED
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx
index bd5397ac328..7a102cf0c88 100644
--- a/python/pylibcudf/pylibcudf/interop.pyx
+++ b/python/pylibcudf/pylibcudf/interop.pyx
@@ -273,10 +273,19 @@ cdef void _release_array(object array_capsule) noexcept:
     free(array)
 
 
+def _maybe_create_nested_column_metadata(Column col):
+    return ColumnMetadata(
+        children_meta=[
+            _maybe_create_nested_column_metadata(child) for child in col.children()
+        ]
+    )
+
+
 def _table_to_schema(Table tbl, metadata):
     if metadata is None:
-        metadata = [ColumnMetadata() for _ in range(len(tbl.columns()))]
-    metadata = [ColumnMetadata(m) if isinstance(m, str) else m for m in metadata]
+        metadata = [_maybe_create_nested_column_metadata(col) for col in tbl.columns()]
+    else:
+        metadata = [ColumnMetadata(m) if isinstance(m, str) else m for m in metadata]
 
     cdef vector[column_metadata] c_metadata
     c_metadata.reserve(len(metadata))
diff --git a/python/pylibcudf/pylibcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/io/avro.pxd
index 8696fcb3c15..a0fca95d459 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pxd
+++ b/python/pylibcudf/pylibcudf/io/avro.pxd
@@ -1,12 +1,23 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
-from pylibcudf.libcudf.io.avro cimport avro_reader_options
+from pylibcudf.libcudf.io.avro cimport avro_reader_options, avro_reader_options_builder
 from pylibcudf.libcudf.types cimport size_type
 
 
-cpdef TableWithMetadata read_avro(
-    SourceInfo source_info,
-    list columns = *,
-    size_type skip_rows = *,
-    size_type num_rows = *
-)
+from pylibcudf.libcudf.types cimport size_type
+
+cdef class AvroReaderOptions:
+    cdef avro_reader_options c_obj
+    cdef SourceInfo source
+    cpdef void set_columns(self, list col_names)
+
+
+cdef class AvroReaderOptionsBuilder:
+    cdef avro_reader_options_builder c_obj
+    cdef SourceInfo source
+    cpdef AvroReaderOptionsBuilder columns(self, list col_names)
+    cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows)
+    cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows)
+    cpdef AvroReaderOptions build(self)
+
+cpdef TableWithMetadata read_avro(AvroReaderOptions options)
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi
index 49c2f083702..8cafc9a6573 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pyi
+++ b/python/pylibcudf/pylibcudf/io/avro.pyi
@@ -1,11 +1,16 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from pylibcudf.io.types import SourceInfo, TableWithMetadata
 
-__all__ = ["read_avro"]
-
-def read_avro(
-    source_info: SourceInfo,
-    columns: list[str] | None = None,
-    skip_rows: int = 0,
-    num_rows: int = -1,
-) -> TableWithMetadata: ...
+__all__ = ["AvroReaderOptions", "AvroReaderOptionsBuilder", "read_avro"]
+
+class AvroReaderOptions:
+    @staticmethod
+    def builder(source: SourceInfo) -> AvroReaderOptionsBuilder: ...
+
+class AvroReaderOptionsBuilder:
+    def columns(col_names: list[str]) -> AvroReaderOptionsBuilder: ...
+    def skip_rows(skip_rows: int) -> AvroReaderOptionsBuilder: ...
+    def num_rows(num_rows: int) -> AvroReaderOptionsBuilder: ...
+    def build(self) -> AvroReaderOptions: ...
+
+def read_avro(options: AvroReaderOptions) -> TableWithMetadata: ...
diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx
index 4271333511a..c378fca0415 100644
--- a/python/pylibcudf/pylibcudf/io/avro.pyx
+++ b/python/pylibcudf/pylibcudf/io/avro.pyx
@@ -10,52 +10,138 @@ from pylibcudf.libcudf.io.avro cimport (
 )
 from pylibcudf.libcudf.types cimport size_type
 
-__all__ = ["read_avro"]
+__all__ = ["read_avro", "AvroReaderOptions", "AvroReaderOptionsBuilder"]
+
+
+cdef class AvroReaderOptions:
+    """
+    The settings to use for ``read_avro``
+    For details, see :cpp:class:`cudf::io::avro_reader_options`
+    """
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create an AvroReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::avro_reader_options::builder`
+
+        Parameters
+        ----------
+        source : SourceInfo
+            The source to read the Avro file from.
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+            Builder to build AvroReaderOptions
+        """
+        cdef AvroReaderOptionsBuilder avro_builder = AvroReaderOptionsBuilder.__new__(
+            AvroReaderOptionsBuilder
+        )
+        avro_builder.c_obj = avro_reader_options.builder(source.c_obj)
+        avro_builder.source = source
+        return avro_builder
+
+    cpdef void set_columns(self, list col_names):
+        """
+        Set the names of the columns to be read.
+
+        Parameters
+        ----------
+        col_names : list[str]
+            List of column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec
+        vec.reserve(len(col_names))
+        for name in col_names:
+            vec.push_back(str(name).encode())
+        self.c_obj.set_columns(vec)
+
+
+cdef class AvroReaderOptionsBuilder:
+    cpdef AvroReaderOptionsBuilder columns(self, list col_names):
+        """
+        Set the names of the columns to be read.
+
+        Parameters
+        ----------
+        col_names : list
+            List of column names
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+        """
+        cdef vector[string] vec
+        vec.reserve(len(col_names))
+        for name in col_names:
+            vec.push_back(str(name).encode())
+        self.c_obj.columns(vec)
+        return self
+
+    cpdef AvroReaderOptionsBuilder skip_rows(self, size_type skip_rows):
+        """
+        Sets number of rows to skip.
+
+        Parameters
+        ----------
+        skip_rows : size_type
+            Number of rows to skip from start
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+        """
+        self.c_obj.skip_rows(skip_rows)
+        return self
+
+    cpdef AvroReaderOptionsBuilder num_rows(self, size_type num_rows):
+        """
+        Sets number of rows to read.
+
+        Parameters
+        ----------
+        num_rows : size_type
+            Number of rows to read after skip
+
+        Returns
+        -------
+        AvroReaderOptionsBuilder
+        """
+        self.c_obj.num_rows(num_rows)
+        return self
+
+    cpdef AvroReaderOptions build(self):
+        """Create an AvroReaderOptions object"""
+        cdef AvroReaderOptions avro_options = AvroReaderOptions.__new__(
+            AvroReaderOptions
+        )
+        avro_options.c_obj = move(self.c_obj.build())
+        avro_options.source = self.source
+        return avro_options
 
 
 cpdef TableWithMetadata read_avro(
-    SourceInfo source_info,
-    list columns = None,
-    size_type skip_rows = 0,
-    size_type num_rows = -1
+    AvroReaderOptions options
 ):
     """
-    Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
+    Read from Avro format.
+
+    The source to read from and options are encapsulated
+    by the `options` object.
 
     For details, see :cpp:func:`read_avro`.
 
     Parameters
     ----------
-    source_info: SourceInfo
-        The SourceInfo object to read the avro dataset from.
-    columns: list, default None
-        Optional columns to read, if not provided, reads all columns in the file.
-    skip_rows: size_type, default 0
-        The number of rows to skip.
-    num_rows: size_type, default -1
-        The number of rows to read, after skipping rows.
-        If -1 is passed, all rows will be read.
-
-    Returns
-    -------
-    TableWithMetadata
-        The Table and its corresponding metadata (column names) that were read in.
+    options: AvroReaderOptions
+        Settings for controlling reading behavior
     """
-    cdef vector[string] c_columns
-    if columns is not None and len(columns) > 0:
-        c_columns.reserve(len(columns))
-        for col in columns:
-            c_columns.push_back(str(col).encode())
-
-    cdef avro_reader_options avro_opts = (
-        avro_reader_options.builder(source_info.c_obj)
-        .columns(c_columns)
-        .skip_rows(skip_rows)
-        .num_rows(num_rows)
-        .build()
-    )
-
     with nogil:
-        c_result = move(cpp_read_avro(avro_opts))
+        c_result = move(cpp_read_avro(options.c_obj))
 
     return TableWithMetadata.from_libcudf(c_result)
diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd
index f65c1034598..7ce3cb859a5 100644
--- a/python/pylibcudf/pylibcudf/io/json.pxd
+++ b/python/pylibcudf/pylibcudf/io/json.pxd
@@ -6,42 +6,78 @@ from pylibcudf.io.types cimport (
     TableWithMetadata,
     compression_type,
 )
-from pylibcudf.libcudf.io.json cimport json_recovery_mode_t
+from pylibcudf.libcudf.io.json cimport (
+    json_recovery_mode_t,
+    json_reader_options,
+    json_reader_options_builder,
+    json_writer_options,
+    json_writer_options_builder,
+)
 from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.table cimport Table
 
 
-cpdef TableWithMetadata read_json(
-    SourceInfo source_info,
-    list dtypes = *,
-    compression_type compression = *,
-    bool lines = *,
-    size_t byte_range_offset = *,
-    size_t byte_range_size = *,
-    bool keep_quotes = *,
-    bool mixed_types_as_string = *,
-    bool prune_columns = *,
-    json_recovery_mode_t recovery_mode = *,
-)
+cdef class JsonReaderOptions:
+    cdef json_reader_options c_obj
+    cdef SourceInfo source
+    cpdef void set_dtypes(self, list types)
+    cpdef void enable_keep_quotes(self, bool keep_quotes)
+    cpdef void enable_mixed_types_as_string(self, bool mixed_types_as_string)
+    cpdef void enable_prune_columns(self, bool prune_columns)
+    cpdef void set_byte_range_offset(self, size_t offset)
+    cpdef void set_byte_range_size(self, size_t size)
+    cpdef void enable_lines(self, bool val)
+    # These hidden options are subject to change without a deprecation cycle.
+    # These are used to test libcudf JSON reader features, not used in cuDF.
+    cpdef void set_delimiter(self, str val)
+    cpdef void enable_dayfirst(self, bool val)
+    cpdef void enable_experimental(self, bool val)
+    cpdef void enable_normalize_single_quotes(self, bool val)
+    cpdef void enable_normalize_whitespace(self, bool val)
+    cpdef void set_strict_validation(self, bool val)
+    cpdef void allow_unquoted_control_chars(self, bool val)
+    cpdef void allow_numeric_leading_zeros(self, bool val)
+    cpdef void allow_nonnumeric_numbers(self, bool val)
+    cpdef void set_na_values(self, list vals)
 
+cdef class JsonReaderOptionsBuilder:
+    cdef json_reader_options_builder c_obj
+    cdef SourceInfo source
+    cpdef JsonReaderOptionsBuilder compression(self, compression_type compression)
+    cpdef JsonReaderOptionsBuilder lines(self, bool val)
+    cpdef JsonReaderOptionsBuilder keep_quotes(self, bool val)
+    cpdef JsonReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset)
+    cpdef JsonReaderOptionsBuilder byte_range_size(self, size_t byte_range_size)
+    cpdef JsonReaderOptionsBuilder recovery_mode(
+        self, json_recovery_mode_t recovery_mode
+    )
+    cpdef build(self)
 
-cpdef void write_json(
-    SinkInfo sink_info,
-    TableWithMetadata tbl,
-    str na_rep = *,
-    bool include_nulls = *,
-    bool lines = *,
-    size_type rows_per_chunk = *,
-    str true_value = *,
-    str false_value = *
-)
+cpdef TableWithMetadata read_json(JsonReaderOptions options)
+
+cdef class JsonWriterOptions:
+    cdef json_writer_options c_obj
+    cdef SinkInfo sink
+    cdef Table table
+    cpdef void set_rows_per_chunk(self, size_type val)
+    cpdef void set_true_value(self, str val)
+    cpdef void set_false_value(self, str val)
+    cpdef void set_compression(self, compression_type comptype)
+
+cdef class JsonWriterOptionsBuilder:
+    cdef json_writer_options_builder c_obj
+    cdef SinkInfo sink
+    cdef Table table
+    cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta)
+    cpdef JsonWriterOptionsBuilder na_rep(self, str val)
+    cpdef JsonWriterOptionsBuilder include_nulls(self, bool val)
+    cpdef JsonWriterOptionsBuilder lines(self, bool val)
+    cpdef JsonWriterOptionsBuilder compression(self, compression_type comptype)
+    cpdef JsonWriterOptions build(self)
+
+cpdef void write_json(JsonWriterOptions options)
 
 cpdef tuple chunked_read_json(
-    SourceInfo source_info,
-    list dtypes = *,
-    compression_type compression = *,
-    bool keep_quotes = *,
-    bool mixed_types_as_string = *,
-    bool prune_columns = *,
-    json_recovery_mode_t recovery_mode = *,
+    JsonReaderOptions options,
     int chunk_size= *,
 )
diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi
index b2bc6a43700..db4546f138d 100644
--- a/python/pylibcudf/pylibcudf/io/json.pyi
+++ b/python/pylibcudf/pylibcudf/io/json.pyi
@@ -2,6 +2,8 @@
 from collections.abc import Mapping
 from typing import TypeAlias
 
+from typing_extensions import Self
+
 from pylibcudf.column import Column
 from pylibcudf.io.types import (
     CompressionType,
@@ -10,41 +12,66 @@ from pylibcudf.io.types import (
     SourceInfo,
     TableWithMetadata,
 )
+from pylibcudf.table import Table
 from pylibcudf.types import DataType
 
 ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap]
 
 NameAndType: TypeAlias = tuple[str, DataType, list[NameAndType]]
 
-def read_json(
-    source_info: SourceInfo,
-    dtypes: list[NameAndType] | None = None,
-    compression: CompressionType = CompressionType.AUTO,
-    lines: bool = False,
-    byte_range_offset: int = 0,
-    byte_range_size: int = 0,
-    keep_quotes: bool = False,
-    mixed_types_as_string: bool = False,
-    prune_columns: bool = False,
-    recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL,
-) -> TableWithMetadata: ...
-def write_json(
-    sink_info: SinkInfo,
-    table_w_meta: TableWithMetadata,
-    na_rep: str = "",
-    include_nulls: bool = False,
-    lines: bool = False,
-    rows_per_chunk: int = 2**32 - 1,
-    true_value: str = "true",
-    false_value: str = "false",
-) -> None: ...
+class JsonReaderOptions:
+    def set_dtypes(
+        self, types: list[DataType] | list[NameAndType]
+    ) -> None: ...
+    def enable_keep_quotes(self, keep_quotes: bool) -> None: ...
+    def enable_mixed_types_as_string(
+        self, mixed_types_as_string: bool
+    ) -> None: ...
+    def enable_prune_columns(self, prune_columns: bool) -> None: ...
+    def set_byte_range_offset(self, offset: int) -> None: ...
+    def set_byte_range_size(self, size: int) -> None: ...
+    def enable_lines(self, val: bool) -> None: ...
+    def set_delimiter(self, val: str) -> None: ...
+    def enable_dayfirst(self, val: bool) -> None: ...
+    def enable_experimental(self, val: bool) -> None: ...
+    def enable_normalize_single_quotes(self, val: bool) -> None: ...
+    def enable_normalize_whitespace(self, val: bool) -> None: ...
+    def set_strict_validation(self, val: bool) -> None: ...
+    def allow_unquoted_control_chars(self, val: bool) -> None: ...
+    def allow_numeric_leading_zeros(self, val: bool) -> None: ...
+    def allow_nonnumeric_numbers(self, val: bool) -> None: ...
+    def set_na_values(self, vals: list[str]) -> None: ...
+    @staticmethod
+    def builder(source: SourceInfo) -> JsonReaderOptionsBuilder: ...
+
+class JsonReaderOptionsBuilder:
+    def compression(self, compression: CompressionType) -> Self: ...
+    def lines(self, lines: bool) -> Self: ...
+    def byte_range_offset(self, byte_range_offset: int) -> Self: ...
+    def byte_range_size(self, byte_range_size: int) -> Self: ...
+    def recovery_mode(self, recovery_mode: JSONRecoveryMode) -> Self: ...
+    def build(self) -> JsonReaderOptions: ...
+
+def read_json(options: JsonReaderOptions) -> TableWithMetadata: ...
+
+class JsonWriterOptions:
+    @staticmethod
+    def builder(sink: SinkInfo, table: Table) -> JsonWriterOptionsBuilder: ...
+    def set_rows_per_chunk(self, val: int) -> None: ...
+    def set_true_value(self, val: str) -> None: ...
+    def set_false_value(self, val: str) -> None: ...
+    def set_compression(self, comptype: CompressionType) -> None: ...
+
+class JsonWriterOptionsBuilder:
+    def metadata(self, tbl_w_meta: TableWithMetadata) -> Self: ...
+    def na_rep(self, val: str) -> Self: ...
+    def include_nulls(self, val: bool) -> Self: ...
+    def lines(self, val: bool) -> Self: ...
+    def compression(self, comptype: CompressionType) -> Self: ...
+    def build(self) -> JsonWriterOptions: ...
+
+def write_json(options: JsonWriterOptions) -> None: ...
 def chunked_read_json(
-    source_info: SourceInfo,
-    dtypes: list[NameAndType] | None = None,
-    compression: CompressionType = CompressionType.AUTO,
-    keep_quotes: bool = False,
-    mixed_types_as_string: bool = False,
-    prune_columns: bool = False,
-    recovery_mode: JSONRecoveryMode = JSONRecoveryMode.FAIL,
+    options: JsonReaderOptions,
     chunk_size: int = 100_000_000,
 ) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ...
diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx
index ad2989925c9..cf286378902 100644
--- a/python/pylibcudf/pylibcudf/io/json.pyx
+++ b/python/pylibcudf/pylibcudf/io/json.pyx
@@ -1,6 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp cimport bool
-from libcpp.limits cimport numeric_limits
 from libcpp.map cimport map
 from libcpp.string cimport string
 from libcpp.utility cimport move
@@ -17,13 +16,20 @@ from pylibcudf.libcudf.io.json cimport (
 )
 from pylibcudf.libcudf.io.types cimport (
     compression_type,
-    table_metadata,
     table_with_metadata,
 )
 from pylibcudf.libcudf.types cimport data_type, size_type
 from pylibcudf.types cimport DataType
 
-__all__ = ["chunked_read_json", "read_json", "write_json"]
+__all__ = [
+    "chunked_read_json",
+    "read_json",
+    "write_json",
+    "JsonReaderOptions",
+    "JsonReaderOptionsBuilder",
+    "JsonWriterOptions",
+    "JsonWriterOptionsBuilder"
+]
 
 cdef map[string, schema_element] _generate_schema_map(list dtypes):
     cdef map[string, schema_element] schema_map
@@ -47,21 +53,21 @@ cdef map[string, schema_element] _generate_schema_map(list dtypes):
     return schema_map
 
 
-cdef json_reader_options _setup_json_reader_options(
+cpdef JsonReaderOptions _setup_json_reader_options(
         SourceInfo source_info,
         list dtypes,
-        compression_type compression,
-        bool lines,
-        size_t byte_range_offset,
-        size_t byte_range_size,
-        bool keep_quotes,
-        bool mixed_types_as_string,
-        bool prune_columns,
-        json_recovery_mode_t recovery_mode):
-
-    cdef vector[data_type] types_vec
-    cdef json_reader_options opts = (
-        json_reader_options.builder(source_info.c_obj)
+        compression_type compression = compression_type.AUTO,
+        bool lines = False,
+        size_t byte_range_offset = 0,
+        size_t byte_range_size = 0,
+        bool keep_quotes = False,
+        bool mixed_types_as_string = False,
+        bool prune_columns = False,
+        json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+        dict extra_parameters=None,
+):
+    options = (
+        JsonReaderOptions.builder(source_info)
         .compression(compression)
         .lines(lines)
         .byte_range_offset(byte_range_offset)
@@ -71,55 +77,359 @@ cdef json_reader_options _setup_json_reader_options(
     )
 
     if dtypes is not None:
-        if isinstance(dtypes[0], tuple):
-            opts.set_dtypes(move(_generate_schema_map(dtypes)))
+        options.set_dtypes(dtypes)
+
+    options.enable_keep_quotes(keep_quotes)
+    options.enable_mixed_types_as_string(mixed_types_as_string)
+    options.enable_prune_columns(prune_columns)
+
+    # These hidden options are subject to change without a deprecation cycle.
+    # These are used to test libcudf JSON reader features, not used in cuDF.
+    if extra_parameters is not None:
+        for key, value in extra_parameters.items():
+            if key == 'delimiter':
+                options.set_delimiter(value)
+            elif key == 'dayfirst':
+                options.enable_dayfirst(value)
+            elif key == 'experimental':
+                options.enable_experimental(value)
+            elif key == 'normalize_single_quotes':
+                options.enable_normalize_single_quotes(value)
+            elif key == 'normalize_whitespace':
+                options.enable_normalize_whitespace(value)
+            elif key == 'strict_validation':
+                options.set_strict_validation(value)
+            elif key == 'allow_unquoted_control_chars':
+                options.allow_unquoted_control_chars(value)
+            elif key == 'allow_numeric_leading_zeros':
+                options.allow_numeric_leading_zeros(value)
+            elif key == 'allow_nonnumeric_numbers':
+                options.allow_nonnumeric_numbers(value)
+            elif key == 'na_values':
+                options.set_na_values(value)
+            else:
+                raise ValueError(
+                    "cudf engine doesn't support the "
+                    f"'{key}' keyword argument for read_json"
+                )
+    return options
+
+
+cdef class JsonReaderOptions:
+    """
+    The settings to use for ``read_json``
+
+    For details, see :cpp:class:`cudf::io::json_reader_options`
+    """
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create a JsonReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::json_reader_options::builder`
+
+        Parameters
+        ----------
+        source : SourceInfo
+            The source to read the JSON file from.
+
+        Returns
+        -------
+        JsonReaderOptionsBuilder
+            Builder to build JsonReaderOptions
+        """
+        cdef JsonReaderOptionsBuilder json_builder = (
+            JsonReaderOptionsBuilder.__new__(JsonReaderOptionsBuilder)
+        )
+        json_builder.c_obj = json_reader_options.builder(source.c_obj)
+        json_builder.source = source
+        return json_builder
+
+    cpdef void set_dtypes(self, list types):
+        """
+        Set data types for columns to be read.
+
+        Parameters
+        ----------
+        types : list
+            List of dtypes or a list of tuples of
+            column names, dtypes, and list of tuples
+            (to support nested column hierarchy)
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[data_type] types_vec
+        if isinstance(types[0], tuple):
+            self.c_obj.set_dtypes(_generate_schema_map(types))
         else:
-            for dtype in dtypes:
+            types_vec.reserve(len(types))
+            for dtype in types:
                 types_vec.push_back((<DataType>dtype).c_obj)
-            opts.set_dtypes(types_vec)
-
-    opts.enable_keep_quotes(keep_quotes)
-    opts.enable_mixed_types_as_string(mixed_types_as_string)
-    opts.enable_prune_columns(prune_columns)
-    return opts
+            self.c_obj.set_dtypes(types_vec)
+
+    cpdef void enable_keep_quotes(self, bool keep_quotes):
+        """
+        Set whether the reader should keep quotes of string values.
+
+        Parameters
+        ----------
+        keep_quotes : bool
+           Boolean value to indicate whether the reader should
+           keep quotes of string values
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_keep_quotes(keep_quotes)
+
+    cpdef void enable_mixed_types_as_string(self, bool mixed_types_as_string):
+        """
+        Set whether to parse mixed types as a string column.
+        Also enables forcing to read a struct as string column using schema.
+
+        Parameters
+        ----------
+        mixed_types_as_string : bool
+           Boolean value to enable/disable parsing mixed types
+           as a string column
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_mixed_types_as_string(mixed_types_as_string)
+
+    cpdef void enable_prune_columns(self, bool prune_columns):
+        """
+        Set whether to prune columns on read, selected
+        based on the ``set_dtypes`` option.
+
+        Parameters
+        ----------
+        prune_columns : bool
+           When set as true, if the reader options include
+           ``set_dtypes``, then the reader will only return those
+           columns which are mentioned in ``set_dtypes``. If false,
+           then all columns are returned, independent of the
+           ``set_dtypes`` setting.
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_prune_columns(prune_columns)
+
+    cpdef void set_byte_range_offset(self, size_t offset):
+        """
+        Set number of bytes to skip from source start.
+
+        Parameters
+        ----------
+        offset : size_t
+            Number of bytes of offset
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_byte_range_offset(offset)
+
+    cpdef void set_byte_range_size(self, size_t size):
+        """
+        Set number of bytes to read.
+
+        Parameters
+        ----------
+        size : size_t
+            Number of bytes to read
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_byte_range_size(size)
+
+    cpdef void enable_lines(self, bool val):
+        """
+        Set whether to read the file as a json object per line.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable the option
+            to read each line as a json object
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.enable_lines(val)
+
+    # These hidden options are subject to change without a deprecation cycle.
+    # These are used to test libcudf JSON reader features, not used in cuDF.
+
+    cpdef void set_delimiter(self, str val):
+        self.c_obj.set_delimiter(val.encode())
+
+    cpdef void enable_dayfirst(self, bool val):
+        self.c_obj.enable_dayfirst(val)
+
+    cpdef void enable_experimental(self, bool val):
+        self.c_obj.enable_experimental(val)
+
+    cpdef void enable_normalize_single_quotes(self, bool val):
+        self.c_obj.enable_normalize_single_quotes(val)
+
+    cpdef void enable_normalize_whitespace(self, bool val):
+        self.c_obj.enable_normalize_whitespace(val)
+
+    cpdef void set_strict_validation(self, bool val):
+        self.c_obj.set_strict_validation(val)
+
+    cpdef void allow_unquoted_control_chars(self, bool val):
+        self.c_obj.allow_unquoted_control_chars(val)
+
+    cpdef void allow_numeric_leading_zeros(self, bool val):
+        self.c_obj.allow_numeric_leading_zeros(val)
+
+    cpdef void allow_nonnumeric_numbers(self, bool val):
+        self.c_obj.allow_nonnumeric_numbers(val)
+
+    cpdef void set_na_values(self, list vals):
+        cdef vector[string] vec
+        for val in vals:
+            if isinstance(val, str):
+                vec.push_back(val.encode())
+        self.c_obj.set_na_values(vec)
+
+
+cdef class JsonReaderOptionsBuilder:
+    cpdef JsonReaderOptionsBuilder compression(self, compression_type compression):
+        """
+        Sets compression type.
+
+        Parameters
+        ----------
+        compression : CompressionType
+            The compression type to use
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.compression(compression)
+        return self
+
+    cpdef JsonReaderOptionsBuilder lines(self, bool val):
+        """
+        Set whether to read the file as a json object per line.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable the option
+            to read each line as a json object
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.lines(val)
+        return self
+
+    cpdef JsonReaderOptionsBuilder keep_quotes(self, bool val):
+        """
+        Set whether the reader should keep quotes of string values.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to indicate whether the
+            reader should keep quotes of string values
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.keep_quotes(val)
+        return self
+
+    cpdef JsonReaderOptionsBuilder byte_range_offset(self, size_t byte_range_offset):
+        """
+        Set number of bytes to skip from source start.
+
+        Parameters
+        ----------
+        byte_range_offset : size_t
+            Number of bytes of offset
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.byte_range_offset(byte_range_offset)
+        return self
+
+    cpdef JsonReaderOptionsBuilder byte_range_size(self, size_t byte_range_size):
+        """
+        Set number of bytes to read.
+
+        Parameters
+        ----------
+        byte_range_size : size_t
+            Number of bytes to read
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.byte_range_size(byte_range_size)
+        return self
+
+    cpdef JsonReaderOptionsBuilder recovery_mode(
+        self,
+        json_recovery_mode_t recovery_mode
+    ):
+        """
+        Specifies the JSON reader's behavior on invalid JSON lines.
+
+        Parameters
+        ----------
+        recovery_mode : json_recovery_mode_t
+            An enum value to indicate the JSON reader's
+            behavior on invalid JSON lines.
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.recovery_mode(recovery_mode)
+        return self
+
+    cpdef build(self):
+        """Create a JsonReaderOptions object"""
+        cdef JsonReaderOptions json_options = JsonReaderOptions.__new__(
+            JsonReaderOptions
+        )
+        json_options.c_obj = move(self.c_obj.build())
+        json_options.source = self.source
+        return json_options
 
 
 cpdef tuple chunked_read_json(
-    SourceInfo source_info,
-    list dtypes = None,
-    compression_type compression = compression_type.AUTO,
-    bool keep_quotes = False,
-    bool mixed_types_as_string = False,
-    bool prune_columns = False,
-    json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+    JsonReaderOptions options,
     int chunk_size=100_000_000,
 ):
-    """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`.
+    """
+    Reads chunks of a JSON file into a :py:class:`~.types.TableWithMetadata`.
 
     Parameters
     ----------
-    source_info : SourceInfo
-        The SourceInfo object to read the JSON file from.
-    dtypes : list, default None
-        Set data types for the columns in the JSON file.
-
-        Each element of the list has the format
-        (column_name, column_dtype, list of child dtypes), where
-        the list of child dtypes is an empty list if the child is not
-        a nested type (list or struct dtype), and is of format
-        (column_child_name, column_child_type, list of grandchild dtypes).
-    compression: CompressionType, default CompressionType.AUTO
-        The compression format of the JSON source.
-    keep_quotes : bool, default False
-        Whether the reader should keep quotes of string values.
-    mixed_types_as_string : bool, default False
-        If True, mixed type columns are returned as string columns.
-        If `False` parsing mixed type columns will thrown an error.
-    prune_columns : bool, default False
-        Whether to only read columns specified in dtypes.
-    recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
-        Whether to raise an error or set corresponding values to null
-        when encountering an invalid JSON line.
+    options : JsonReaderOptions
+        Settings for controlling reading behavior
     chunk_size : int, default 100_000_000 bytes.
         The number of bytes to be read in chunks.
         The chunk_size should be set to at least row_size.
@@ -132,20 +442,6 @@ cpdef tuple chunked_read_json(
     cdef size_type c_range_size = (
         chunk_size if chunk_size is not None else 0
     )
-    cdef json_reader_options opts = _setup_json_reader_options(
-        source_info=source_info,
-        dtypes=dtypes,
-        compression=compression,
-        lines=True,
-        byte_range_offset=0,
-        byte_range_size=0,
-        keep_quotes=keep_quotes,
-        mixed_types_as_string=mixed_types_as_string,
-        prune_columns=prune_columns,
-        recovery_mode=recovery_mode,
-    )
-
-    # Read JSON
     cdef table_with_metadata c_result
 
     final_columns = []
@@ -153,12 +449,13 @@ cpdef tuple chunked_read_json(
     child_names = None
     i = 0
     while True:
-        opts.set_byte_range_offset(c_range_size * i)
-        opts.set_byte_range_size(c_range_size)
+        options.enable_lines(True)
+        options.set_byte_range_offset(c_range_size * i)
+        options.set_byte_range_size(c_range_size)
 
         try:
             with nogil:
-                c_result = move(cpp_read_json(opts))
+                c_result = move(cpp_read_json(options.c_obj))
         except (ValueError, OverflowError):
             break
         if meta_names is None:
@@ -186,125 +483,229 @@ cpdef tuple chunked_read_json(
 
 
 cpdef TableWithMetadata read_json(
-    SourceInfo source_info,
-    list dtypes = None,
-    compression_type compression = compression_type.AUTO,
-    bool lines = False,
-    size_t byte_range_offset = 0,
-    size_t byte_range_size = 0,
-    bool keep_quotes = False,
-    bool mixed_types_as_string = False,
-    bool prune_columns = False,
-    json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+    JsonReaderOptions options
 ):
-    """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`.
+    """
+    Read from JSON format.
+
+    The source to read from and options are encapsulated
+    by the `options` object.
+
+    For details, see :cpp:func:`read_json`.
 
     Parameters
     ----------
-    source_info : SourceInfo
-        The SourceInfo object to read the JSON file from.
-    dtypes : list, default None
-        Set data types for the columns in the JSON file.
-
-        Each element of the list has the format
-        (column_name, column_dtype, list of child dtypes), where
-        the list of child dtypes is an empty list if the child is not
-        a nested type (list or struct dtype), and is of format
-        (column_child_name, column_child_type, list of grandchild dtypes).
-    compression: CompressionType, default CompressionType.AUTO
-        The compression format of the JSON source.
-    byte_range_offset : size_t, default 0
-        Number of bytes to skip from source start.
-    byte_range_size : size_t, default 0
-        Number of bytes to read. By default, will read all bytes.
-    keep_quotes : bool, default False
-        Whether the reader should keep quotes of string values.
-    mixed_types_as_string : bool, default False
-        If True, mixed type columns are returned as string columns.
-        If `False` parsing mixed type columns will thrown an error.
-    prune_columns : bool, default False
-        Whether to only read columns specified in dtypes.
-    recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
-        Whether to raise an error or set corresponding values to null
-        when encountering an invalid JSON line.
+    options: JsonReaderOptions
+        Settings for controlling reading behavior
 
     Returns
     -------
     TableWithMetadata
         The Table and its corresponding metadata (column names) that were read in.
     """
-    cdef json_reader_options opts = _setup_json_reader_options(
-        source_info=source_info,
-        dtypes=dtypes,
-        compression=compression,
-        lines=lines,
-        byte_range_offset=byte_range_offset,
-        byte_range_size=byte_range_size,
-        keep_quotes=keep_quotes,
-        mixed_types_as_string=mixed_types_as_string,
-        prune_columns=prune_columns,
-        recovery_mode=recovery_mode,
-    )
-
-    # Read JSON
     cdef table_with_metadata c_result
 
     with nogil:
-        c_result = move(cpp_read_json(opts))
+        c_result = move(cpp_read_json(options.c_obj))
 
     return TableWithMetadata.from_libcudf(c_result)
 
 
-cpdef void write_json(
-    SinkInfo sink_info,
-    TableWithMetadata table_w_meta,
-    str na_rep = "",
-    bool include_nulls = False,
-    bool lines = False,
-    size_type rows_per_chunk = numeric_limits[size_type].max(),
-    str true_value = "true",
-    str false_value = "false"
-):
+cdef class JsonWriterOptions:
     """
-    Writes a :py:class:`~pylibcudf.table.Table` to JSON format.
+    The settings to use for ``write_json``
 
-    Parameters
-    ----------
-    sink_info: SinkInfo
-        The SinkInfo object to write the JSON to.
-    table_w_meta: TableWithMetadata
-        The TableWithMetadata object containing the Table to write
-    na_rep: str, default ""
-        The string representation for null values.
-    include_nulls: bool, default False
+    For details, see :cpp:class:`cudf::io::json_writer_options`
+    """
+    @staticmethod
+    def builder(SinkInfo sink, Table table):
+        """
+        Create a JsonWriterOptionsBuilder object
+
+        Parameters
+        ----------
+        sink : SinkInfo
+            The sink used for writer output
+        table : Table
+            Table to be written to output
+
+        Returns
+        -------
+        JsonWriterOptionsBuilder
+            Builder to build JsonWriterOptions
+        """
+        cdef JsonWriterOptionsBuilder json_builder = (
+            JsonWriterOptionsBuilder.__new__(JsonWriterOptionsBuilder)
+        )
+        json_builder.c_obj = json_writer_options.builder(sink.c_obj, table.view())
+        json_builder.sink = sink
+        json_builder.table = table
+        return json_builder
+
+    cpdef void set_rows_per_chunk(self, size_type val):
+        """
+        Sets the maximum number of rows to write at a time.
+
+        Parameters
+        ----------
+        val : size_type
+            Maximum number of rows to write in each chunk
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_rows_per_chunk(val)
+
+    cpdef void set_true_value(self, str val):
+        """
+        Sets string used for values != 0
+
+        Parameters
+        ----------
+        val : str
+            String to represent values != 0
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_true_value(val.encode())
+
+    cpdef void set_false_value(self, str val):
+        """
+        Sets string used for values == 0
+
+        Parameters
+        ----------
+        val : str
+            String to represent values == 0
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_false_value(val.encode())
+
+    cpdef void set_compression(self, compression_type comptype):
+        """
+        Sets compression type to be used
+
+        Parameters
+        ----------
+        comptype : CompressionType
+            Compression type for sink
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_compression(comptype)
+
+cdef class JsonWriterOptionsBuilder:
+    cpdef JsonWriterOptionsBuilder metadata(self, TableWithMetadata tbl_w_meta):
+        """
+        Sets optional metadata (with column names).
+
+        Parameters
+        ----------
+        tbl_w_meta : TableWithMetadata
+            Associated metadata
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.metadata(tbl_w_meta.metadata)
+        return self
+
+    cpdef JsonWriterOptionsBuilder na_rep(self, str val):
+        """
+        Sets string to use for null entries.
+
+        Parameters
+        ----------
+        val : str
+            String to represent null value
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.na_rep(val.encode())
+        return self
+
+    cpdef JsonWriterOptionsBuilder include_nulls(self, bool val):
+        """
         Enables/Disables output of nulls as 'null'.
-    lines: bool, default False
-        If `True`, write output in the JSON lines format.
-    rows_per_chunk: size_type, defaults to length of the input table
-        The maximum number of rows to write at a time.
-    true_value: str, default "true"
-        The string representation for values != 0 in INT8 types.
-    false_value: str, default "false"
-        The string representation for values == 0 in INT8 types.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.include_nulls(val)
+        return self
+
+    cpdef JsonWriterOptionsBuilder lines(self, bool val):
+        """
+        Enables/Disables JSON lines for records format.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.lines(val)
+        return self
+
+    cpdef JsonWriterOptionsBuilder compression(self, compression_type comptype):
+        """
+        Sets compression type of output sink.
+
+        Parameters
+        ----------
+        comptype : CompressionType
+            Compression type used
+
+        Returns
+        -------
+        Self
+        """
+        self.c_obj.compression(comptype)
+        return self
+
+    cpdef JsonWriterOptions build(self):
+        """Create a JsonWriterOptions object"""
+        cdef JsonWriterOptions json_options = JsonWriterOptions.__new__(
+            JsonWriterOptions
+        )
+        json_options.c_obj = move(self.c_obj.build())
+        json_options.sink = self.sink
+        json_options.table = self.table
+        return json_options
+
+
+cpdef void write_json(JsonWriterOptions options):
     """
-    cdef table_metadata tbl_meta = table_w_meta.metadata
-    cdef string na_rep_c = na_rep.encode()
-
-    cdef json_writer_options options = (
-        json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view())
-        .metadata(tbl_meta)
-        .na_rep(na_rep_c)
-        .include_nulls(include_nulls)
-        .lines(lines)
-        .build()
-    )
+    Writes a set of columns to JSON format.
 
-    if rows_per_chunk != numeric_limits[size_type].max():
-        options.set_rows_per_chunk(rows_per_chunk)
-    if true_value != "true":
-        options.set_true_value(<string>true_value.encode())
-    if false_value != "false":
-        options.set_false_value(<string>false_value.encode())
+    Parameters
+    ----------
+    options : JsonWriterOptions
+        Settings for controlling writing behavior
 
+    Returns
+    -------
+    None
+    """
     with nogil:
-        cpp_write_json(options)
+        cpp_write_json(options.c_obj)
diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd
index 671f0692444..7531608519c 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pxd
+++ b/python/pylibcudf/pylibcudf/io/orc.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-from libc.stdint cimport uint64_t
+from libc.stdint cimport uint64_t, int64_t
 from libcpp cimport bool
 from libcpp.optional cimport optional
 from libcpp.string cimport string
@@ -19,6 +19,8 @@ from pylibcudf.libcudf.io.orc_metadata cimport (
 )
 from pylibcudf.libcudf.io.orc cimport (
     orc_chunked_writer,
+    orc_reader_options,
+    orc_reader_options_builder,
     orc_writer_options,
     orc_writer_options_builder,
     chunked_orc_writer_options,
@@ -32,17 +34,23 @@ from pylibcudf.libcudf.io.types cimport (
     statistics_freq,
 )
 
-cpdef TableWithMetadata read_orc(
-    SourceInfo source_info,
-    list columns = *,
-    list stripes = *,
-    size_type skip_rows = *,
-    size_type nrows = *,
-    bool use_index = *,
-    bool use_np_dtypes = *,
-    DataType timestamp_type = *,
-    list decimal128_columns = *
-)
+cdef class OrcReaderOptions:
+    cdef orc_reader_options c_obj
+    cdef SourceInfo source
+    cpdef void set_num_rows(self, int64_t nrows)
+    cpdef void set_skip_rows(self, int64_t skip_rows)
+    cpdef void set_stripes(self, list stripes)
+    cpdef void set_decimal128_columns(self, list val)
+    cpdef void set_timestamp_type(self, DataType type_)
+    cpdef void set_columns(self, list col_names)
+
+cdef class OrcReaderOptionsBuilder:
+    cdef orc_reader_options_builder c_obj
+    cdef SourceInfo source
+    cpdef OrcReaderOptionsBuilder use_index(self, bool use)
+    cpdef OrcReaderOptions build(self)
+
+cpdef TableWithMetadata read_orc(OrcReaderOptions options)
 
 cdef class OrcColumnStatistics:
     cdef optional[uint64_t] number_of_values_c
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi
index 516f97981e9..c496b7a2152 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pyi
+++ b/python/pylibcudf/pylibcudf/io/orc.pyi
@@ -1,6 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from typing import Any, Self
+from typing import Any
+
+from typing_extensions import Self
 
 from pylibcudf.io.types import (
     CompressionType,
@@ -11,19 +13,21 @@ from pylibcudf.io.types import (
     TableWithMetadata,
 )
 from pylibcudf.table import Table
-from pylibcudf.types import DataType
 
-def read_orc(
-    source_info: SourceInfo,
-    columns: list[str] | None = None,
-    stripes: list[list[int]] | None = None,
-    skip_rows: int = 0,
-    nrows: int = -1,
-    use_index: bool = True,
-    use_np_dtypes: bool = True,
-    timestamp_type: DataType | None = None,
-    decimal128_columns: list[str] | None = None,
-) -> TableWithMetadata: ...
+class OrcReaderOptions:
+    def set_num_rows(self, nrows: int) -> None: ...
+    def set_skip_rows(self, skip_rows: int) -> None: ...
+    def set_stripes(self, stripes: list[list[int]]) -> None: ...
+    def set_decimal128_columns(self, val: list[str]) -> None: ...
+    def set_columns(self, col_names: list[str]) -> None: ...
+    @staticmethod
+    def builder(source: SourceInfo) -> OrcReaderOptionsBuilder: ...
+
+class OrcReaderOptionsBuilder:
+    def use_index(self, use: bool) -> Self: ...
+    def build(self) -> OrcReaderOptions: ...
+
+def read_orc(options: OrcReaderOptions) -> TableWithMetadata: ...
 
 class OrcColumnStatistics:
     def __init__(self): ...
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx
index 63eab4a9634..c125d7e76fa 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pyx
+++ b/python/pylibcudf/pylibcudf/io/orc.pyx
@@ -46,6 +46,8 @@ __all__ = [
     "read_orc",
     "read_parsed_orc_statistics",
     "write_orc",
+    "OrcReaderOptions",
+    "OrcReaderOptionsBuilder",
     "OrcWriterOptions",
     "OrcWriterOptionsBuilder",
     "OrcChunkedWriter",
@@ -237,84 +239,190 @@ cdef class ParsedOrcStatistics:
         return out
 
 
-cpdef TableWithMetadata read_orc(
-    SourceInfo source_info,
-    list columns = None,
-    list stripes = None,
-    size_type skip_rows = 0,
-    size_type nrows = -1,
-    bool use_index = True,
-    bool use_np_dtypes = True,
-    DataType timestamp_type = None,
-    list decimal128_columns = None,
-):
-    """Reads an ORC file into a :py:class:`~.types.TableWithMetadata`.
-
-    Parameters
-    ----------
-    source_info : SourceInfo
-        The SourceInfo object to read the Parquet file from.
-    columns : list, default None
-        The string names of the columns to be read.
-    stripes : list[list[size_type]], default None
-        List of stripes to be read.
-    skip_rows : int64_t, default 0
-        The number of rows to skip from the start of the file.
-    nrows : size_type, default -1
-        The number of rows to read. By default, read the entire file.
-    use_index : bool, default True
-        Whether to use the row index to speed up reading.
-    use_np_dtypes : bool, default True
-        Whether to use numpy compatible dtypes.
-    timestamp_type : DataType, default None
-        The timestamp type to use for the timestamp columns.
-    decimal128_columns : list, default None
-        List of column names to be read as 128-bit decimals.
+cdef class OrcReaderOptions:
+    """
+    The settings to use for ``read_orc``
 
-    Returns
-    -------
-    TableWithMetadata
-        The Table and its corresponding metadata (column names) that were read in.
+    For details, see :cpp:class:`cudf::io::orc_reader_options`
     """
-    cdef orc_reader_options opts
-    cdef vector[vector[size_type]] c_stripes
-    opts = (
-        orc_reader_options.builder(source_info.c_obj)
-        .use_index(use_index)
-        .build()
-    )
-    if nrows >= 0:
-        opts.set_num_rows(nrows)
-    if skip_rows >= 0:
-        opts.set_skip_rows(skip_rows)
-    if stripes is not None:
-        c_stripes = stripes
-        opts.set_stripes(c_stripes)
-    if timestamp_type is not None:
-        opts.set_timestamp_type(timestamp_type.c_obj)
-
-    cdef vector[string] c_decimal128_columns
-    if decimal128_columns is not None and len(decimal128_columns) > 0:
-        c_decimal128_columns.reserve(len(decimal128_columns))
-        for col in decimal128_columns:
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create an OrcReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::orc_reader_options::builder`
+
+        Parameters
+        ----------
+        source : SourceInfo
+            The source to read the ORC file from.
+
+        Returns
+        -------
+        OrcReaderOptionsBuilder
+            Builder to build OrcReaderOptions
+        """
+        cdef OrcReaderOptionsBuilder orc_builder = (
+            OrcReaderOptionsBuilder.__new__(OrcReaderOptionsBuilder)
+        )
+        orc_builder.c_obj = orc_reader_options.builder(source.c_obj)
+        orc_builder.source = source
+        return orc_builder
+
+    cpdef void set_num_rows(self, int64_t nrows):
+        """
+        Sets number of rows to read.
+
+        Parameters
+        ----------
+        nrows: int64_t
+            Number of rows
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_num_rows(nrows)
+
+    cpdef void set_skip_rows(self, int64_t skip_rows):
+        """
+        Sets number of rows to skip from the start.
+
+        Parameters
+        ----------
+        skip_rows: int64_t
+            Number of rows
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_skip_rows(skip_rows)
+
+    cpdef void set_stripes(self, list stripes):
+        """
+        Sets list of stripes to read for each input source.
+
+        Parameters
+        ----------
+        stripes: list[list[size_type]]
+            List of lists, mapping stripes to read to input sources
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[vector[size_type]] c_stripes
+        cdef vector[size_type] vec
+        for sub_list in stripes:
+            for x in sub_list:
+                vec.push_back(x)
+            c_stripes.push_back(vec)
+            vec.clear()
+        self.c_obj.set_stripes(c_stripes)
+
+    cpdef void set_decimal128_columns(self, list val):
+        """
+        Set columns that should be read as 128-bit Decimal.
+
+        Parameters
+        ----------
+        val: list[str]
+            List of fully qualified column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] c_decimal128_columns
+        c_decimal128_columns.reserve(len(val))
+        for col in val:
             if not isinstance(col, str):
                 raise TypeError("Decimal 128 column names must be strings!")
             c_decimal128_columns.push_back(col.encode())
-        opts.set_decimal128_columns(c_decimal128_columns)
+        self.c_obj.set_decimal128_columns(c_decimal128_columns)
+
+    cpdef void set_timestamp_type(self, DataType type_):
+        """
+        Sets timestamp type to which timestamp column will be cast.
+
+        Parameters
+        ----------
+        type_: DataType
+            Type of timestamp
 
-    cdef vector[string] c_column_names
-    if columns is not None and len(columns) > 0:
-        c_column_names.reserve(len(columns))
-        for col in columns:
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_timestamp_type(type_.c_obj)
+
+    cpdef void set_columns(self, list col_names):
+        """
+        Sets names of the columns to read.
+
+        Parameters
+        ----------
+        col_names: list[str]
+            List of column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] c_column_names
+        c_column_names.reserve(len(col_names))
+        for col in col_names:
             if not isinstance(col, str):
                 raise TypeError("Column names must be strings!")
             c_column_names.push_back(col.encode())
-        opts.set_columns(c_column_names)
+        self.c_obj.set_columns(c_column_names)
+
+cdef class OrcReaderOptionsBuilder:
+    cpdef OrcReaderOptionsBuilder use_index(self, bool use):
+        """
+        Enable/Disable use of row index to speed-up reading.
+
+        Parameters
+        ----------
+        use : bool
+            Boolean value to enable/disable row index use
 
+        Returns
+        -------
+        OrcReaderOptionsBuilder
+        """
+        self.c_obj.use_index(use)
+        return self
+
+    cpdef OrcReaderOptions build(self):
+        """Create an OrcReaderOptions object"""
+        cdef OrcReaderOptions orc_options = OrcReaderOptions.__new__(
+            OrcReaderOptions
+        )
+        orc_options.c_obj = move(self.c_obj.build())
+        orc_options.source = self.source
+        return orc_options
+
+
+cpdef TableWithMetadata read_orc(OrcReaderOptions options):
+    """
+    Read from ORC format.
+
+    The source to read from and options are encapsulated
+    by the `options` object.
+
+    For details, see :cpp:func:`read_orc`.
+
+    Parameters
+    ----------
+    options: OrcReaderOptions
+        Settings for controlling reading behavior
+    """
     cdef table_with_metadata c_result
 
     with nogil:
-        c_result = move(cpp_read_orc(opts))
+        c_result = move(cpp_read_orc(options.c_obj))
 
     return TableWithMetadata.from_libcudf(c_result)
 
@@ -503,7 +611,7 @@ cpdef void write_orc(OrcWriterOptions options):
     The table to write, output paths, and options are encapsulated
     by the `options` object.
 
-    For details, see :cpp:func:`write_csv`.
+    For details, see :cpp:func:`write_orc`.
 
     Parameters
     ----------
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd
index 7bd6ba91ca9..84f47cf5305 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pxd
+++ b/python/pylibcudf/pylibcudf/io/parquet.pxd
@@ -19,6 +19,8 @@ from pylibcudf.libcudf.io.parquet cimport (
     chunked_parquet_reader as cpp_chunked_parquet_reader,
     parquet_writer_options,
     parquet_writer_options_builder,
+    parquet_reader_options,
+    parquet_reader_options_builder,
     chunked_parquet_writer_options,
     chunked_parquet_writer_options_builder,
 )
@@ -27,6 +29,25 @@ from pylibcudf.table cimport Table
 from pylibcudf.types cimport DataType
 
 
+cdef class ParquetReaderOptions:
+    cdef parquet_reader_options c_obj
+    cdef SourceInfo source
+    cpdef void set_row_groups(self, list row_groups)
+    cpdef void set_num_rows(self, size_type nrows)
+    cpdef void set_skip_rows(self, int64_t skip_rows)
+    cpdef void set_columns(self, list col_names)
+    cpdef void set_filter(self, Expression filter)
+
+cdef class ParquetReaderOptionsBuilder:
+    cdef parquet_reader_options_builder c_obj
+    cdef SourceInfo source
+    cpdef ParquetReaderOptionsBuilder convert_strings_to_categories(self, bool val)
+    cpdef ParquetReaderOptionsBuilder use_pandas_metadata(self, bool val)
+    cpdef ParquetReaderOptionsBuilder allow_mismatched_pq_schemas(self, bool val)
+    cpdef ParquetReaderOptionsBuilder use_arrow_schema(self, bool val)
+    cpdef build(self)
+
+
 cdef class ChunkedParquetReader:
     cdef unique_ptr[cpp_chunked_parquet_reader] reader
 
@@ -34,20 +55,7 @@ cdef class ChunkedParquetReader:
     cpdef TableWithMetadata read_chunk(self)
 
 
-cpdef read_parquet(
-    SourceInfo source_info,
-    list columns = *,
-    list row_groups = *,
-    Expression filters = *,
-    bool convert_strings_to_categories = *,
-    bool use_pandas_metadata = *,
-    int64_t skip_rows = *,
-    size_type nrows = *,
-    bool allow_mismatched_pq_schemas = *,
-    # disabled see comment in parquet.pyx for more
-    # ReaderColumnSchema reader_column_schema = *,
-    # DataType timestamp_type = *
-)
+cpdef read_parquet(ParquetReaderOptions options)
 
 
 cdef class ParquetChunkedWriter:
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi
index 22bea1abd8e..2d8d12c1a45 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyi
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyi
@@ -1,7 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from collections.abc import Mapping
-from typing import Self
+
+from typing_extensions import Self
 
 from pylibcudf.expressions import Expression
 from pylibcudf.io.types import (
@@ -16,6 +17,24 @@ from pylibcudf.io.types import (
 )
 from pylibcudf.table import Table
 
+class ParquetReaderOptions:
+    def __init__(self): ...
+    def set_row_groups(self, row_groups: list[list[int]]): ...
+    def set_num_rows(self, nrows: int): ...
+    def set_skip_rows(self, skip_rows: int): ...
+    def set_columns(self, col_names: list[str]): ...
+    def set_filter(self, filter: Expression): ...
+    @staticmethod
+    def builder(source: SourceInfo) -> ParquetReaderOptionsBuilder: ...
+
+class ParquetReaderOptionsBuilder:
+    def __init__(self): ...
+    def convert_strings_to_categories(self, val: bool) -> Self: ...
+    def use_pandas_metadata(self, val: bool) -> Self: ...
+    def allow_mismatched_pq_schemas(self, val: bool) -> Self: ...
+    def use_arrow_schema(self, val: bool) -> Self: ...
+    def build(self) -> ParquetReaderOptions: ...
+
 class ChunkedParquetReader:
     def __init__(
         self,
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index 9bdf849a30c..672fe2be847 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -42,47 +42,204 @@ __all__ = [
     "ParquetWriterOptionsBuilder",
     "read_parquet",
     "write_parquet",
+    "ParquetReaderOptions",
+    "ParquetReaderOptionsBuilder",
     "ChunkedParquetWriterOptions",
     "ChunkedParquetWriterOptionsBuilder"
     "merge_row_group_metadata",
 ]
 
-cdef parquet_reader_options _setup_parquet_reader_options(
-    SourceInfo source_info,
-    list columns = None,
-    list row_groups = None,
-    Expression filters = None,
-    bool convert_strings_to_categories = False,
-    bool use_pandas_metadata = True,
-    int64_t skip_rows = 0,
-    size_type nrows = -1,
-    bool allow_mismatched_pq_schemas=False,
-    # ReaderColumnSchema reader_column_schema = None,
-    # DataType timestamp_type = DataType(type_id.EMPTY)
-):
-    cdef vector[string] col_vec
-    cdef parquet_reader_options opts = (
-        parquet_reader_options.builder(source_info.c_obj)
-        .convert_strings_to_categories(convert_strings_to_categories)
-        .use_pandas_metadata(use_pandas_metadata)
-        .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas)
-        .use_arrow_schema(True)
-        .build()
-    )
-    if row_groups is not None:
-        opts.set_row_groups(row_groups)
-    if nrows != -1:
-        opts.set_num_rows(nrows)
-    if skip_rows != 0:
-        opts.set_skip_rows(skip_rows)
-    if columns is not None:
-        col_vec.reserve(len(columns))
-        for col in columns:
-            col_vec.push_back(<string>str(col).encode())
-        opts.set_columns(col_vec)
-    if filters is not None:
-        opts.set_filter(<expression &>dereference(filters.c_obj.get()))
-    return opts
+
+cdef class ParquetReaderOptions:
+    """The settings to use for ``read_parquet``
+    For details, see :cpp:class:`cudf::io::parquet_reader_options`
+    """
+    @staticmethod
+    def builder(SourceInfo source):
+        """
+        Create a ParquetReaderOptionsBuilder object
+
+        For details, see :cpp:func:`cudf::io::parquet_reader_options::builder`
+
+        Parameters
+        ----------
+        source : SourceInfo
+            The source to read the Parquet file from.
+
+        Returns
+        -------
+        ParquetReaderOptionsBuilder
+            Builder to build ParquetReaderOptions
+        """
+        cdef ParquetReaderOptionsBuilder parquet_builder = (
+            ParquetReaderOptionsBuilder.__new__(ParquetReaderOptionsBuilder)
+        )
+        parquet_builder.c_obj = parquet_reader_options.builder(source.c_obj)
+        parquet_builder.source = source
+        return parquet_builder
+
+    cpdef void set_row_groups(self, list row_groups):
+        """
+        Sets list of individual row groups to read.
+
+        Parameters
+        ----------
+        row_groups : list
+            List of row groups to read
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[vector[size_type]] outer
+        cdef vector[size_type] inner
+        for row_group in row_groups:
+            for x in row_group:
+                inner.push_back(x)
+            outer.push_back(inner)
+            inner.clear()
+
+        self.c_obj.set_row_groups(outer)
+
+    cpdef void set_num_rows(self, size_type nrows):
+        """
+        Sets number of rows to read.
+
+        Parameters
+        ----------
+        nrows : size_type
+            Number of rows to read after skip
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_num_rows(nrows)
+
+    cpdef void set_skip_rows(self, int64_t skip_rows):
+        """
+        Sets number of rows to skip.
+
+        Parameters
+        ----------
+        skip_rows : int64_t
+            Number of rows to skip from start
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_skip_rows(skip_rows)
+
+    cpdef void set_columns(self, list col_names):
+        """
+        Sets names of the columns to be read.
+
+        Parameters
+        ----------
+        col_names : list
+            List of column names
+
+        Returns
+        -------
+        None
+        """
+        cdef vector[string] vec
+        for name in col_names:
+            vec.push_back(<string>str(name).encode())
+        self.c_obj.set_columns(vec)
+
+    cpdef void set_filter(self, Expression filter):
+        """
+        Sets AST based filter for predicate pushdown.
+
+        Parameters
+        ----------
+        filter : Expression
+            AST expression to use as filter
+
+        Returns
+        -------
+        None
+        """
+        self.c_obj.set_filter(<expression &>dereference(filter.c_obj.get()))
+
+
+cdef class ParquetReaderOptionsBuilder:
+    cpdef ParquetReaderOptionsBuilder convert_strings_to_categories(self, bool val):
+        """
+        Sets enable/disable conversion of strings to categories.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value to enable/disable conversion of string columns to categories
+
+        Returns
+        -------
+        ParquetReaderOptionsBuilder
+        """
+        self.c_obj.convert_strings_to_categories(val)
+        return self
+
+    cpdef ParquetReaderOptionsBuilder use_pandas_metadata(self, bool val):
+        """
+        Sets to enable/disable use of pandas metadata to read.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value whether to use pandas metadata
+
+        Returns
+        -------
+        ParquetReaderOptionsBuilder
+        """
+        self.c_obj.use_pandas_metadata(val)
+        return self
+
+    cpdef ParquetReaderOptionsBuilder allow_mismatched_pq_schemas(self, bool val):
+        """
+        Sets to enable/disable reading of matching projected and filter
+        columns from mismatched Parquet sources.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value whether to read matching projected and filter
+            columns from mismatched Parquet sources.
+
+        Returns
+        -------
+        ParquetReaderOptionsBuilder
+        """
+        self.c_obj.allow_mismatched_pq_schemas(val)
+        return self
+
+    cpdef ParquetReaderOptionsBuilder use_arrow_schema(self, bool val):
+        """
+        Sets to enable/disable use of arrow schema to read.
+
+        Parameters
+        ----------
+        val : bool
+            Boolean value whether to use arrow schema
+
+        Returns
+        -------
+        ParquetReaderOptionsBuilder
+        """
+        self.c_obj.use_arrow_schema(val)
+        return self
+
+    cpdef build(self):
+        """Create a ParquetReaderOptions object"""
+        cdef ParquetReaderOptions parquet_options = ParquetReaderOptions.__new__(
+            ParquetReaderOptions
+        )
+        parquet_options.c_obj = move(self.c_obj.build())
+        parquet_options.source = self.source
+        return parquet_options
 
 
 cdef class ChunkedParquetReader:
@@ -93,63 +250,27 @@ cdef class ChunkedParquetReader:
 
     Parameters
     ----------
-    source_info : SourceInfo
-        The SourceInfo object to read the Parquet file from.
-    columns : list, default None
-        The names of the columns to be read
-    row_groups : list[list[size_type]], default None
-        List of row groups to be read.
-    use_pandas_metadata : bool, default True
-        If True, return metadata about the index column in
-        the per-file user metadata of the ``TableWithMetadata``
-    convert_strings_to_categories : bool, default False
-        Whether to convert string columns to the category type
-    skip_rows : int64_t, default 0
-        The number of rows to skip from the start of the file.
-    nrows : size_type, default -1
-        The number of rows to read. By default, read the entire file.
+    options : ParquetReaderOptions
+        Settings for controlling reading behavior
     chunk_read_limit : size_t, default 0
         Limit on total number of bytes to be returned per read,
         or 0 if there is no limit.
     pass_read_limit : size_t, default 1024000000
         Limit on the amount of memory used for reading and decompressing data
         or 0 if there is no limit.
-    allow_mismatched_pq_schemas : bool, default False
-        Whether to read (matching) columns specified in `columns` from
-        the input files with otherwise mismatched schemas.
     """
     def __init__(
         self,
-        SourceInfo source_info,
-        list columns=None,
-        list row_groups=None,
-        bool use_pandas_metadata=True,
-        bool convert_strings_to_categories=False,
-        int64_t skip_rows = 0,
-        size_type nrows = -1,
+        ParquetReaderOptions options,
         size_t chunk_read_limit=0,
         size_t pass_read_limit=1024000000,
-        bool allow_mismatched_pq_schemas=False
     ):
-
-        cdef parquet_reader_options opts = _setup_parquet_reader_options(
-            source_info,
-            columns,
-            row_groups,
-            filters=None,
-            convert_strings_to_categories=convert_strings_to_categories,
-            use_pandas_metadata=use_pandas_metadata,
-            skip_rows=skip_rows,
-            nrows=nrows,
-            allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
-        )
-
         with nogil:
             self.reader.reset(
                 new cpp_chunked_parquet_reader(
                     chunk_read_limit,
                     pass_read_limit,
-                    opts
+                    options.c_obj,
                 )
             )
 
@@ -184,69 +305,23 @@ cdef class ChunkedParquetReader:
 
         return TableWithMetadata.from_libcudf(c_result)
 
-cpdef read_parquet(
-    SourceInfo source_info,
-    list columns = None,
-    list row_groups = None,
-    Expression filters = None,
-    bool convert_strings_to_categories = False,
-    bool use_pandas_metadata = True,
-    int64_t skip_rows = 0,
-    size_type nrows = -1,
-    bool allow_mismatched_pq_schemas = False,
-    # Disabled, these aren't used by cudf-python
-    # we should only add them back in if there's user demand
-    # ReaderColumnSchema reader_column_schema = None,
-    # DataType timestamp_type = DataType(type_id.EMPTY)
-):
-    """Reads an Parquet file into a :py:class:`~.types.TableWithMetadata`.
+
+cpdef read_parquet(ParquetReaderOptions options):
+    """
+    Read from Parquet format.
+
+    The source to read from and options are encapsulated
+    by the `options` object.
 
     For details, see :cpp:func:`read_parquet`.
 
     Parameters
     ----------
-    source_info : SourceInfo
-        The SourceInfo object to read the Parquet file from.
-    columns : list, default None
-        The string names of the columns to be read.
-    row_groups : list[list[size_type]], default None
-        List of row groups to be read.
-    filters : Expression, default None
-        An AST :py:class:`pylibcudf.expressions.Expression`
-        to use for predicate pushdown.
-    convert_strings_to_categories : bool, default False
-        Whether to convert string columns to the category type
-    use_pandas_metadata : bool, default True
-        If True, return metadata about the index column in
-        the per-file user metadata of the ``TableWithMetadata``
-    skip_rows : int64_t, default 0
-        The number of rows to skip from the start of the file.
-    nrows : size_type, default -1
-        The number of rows to read. By default, read the entire file.
-    allow_mismatched_pq_schemas : bool, default False
-        If True, enable reading (matching) columns specified in `columns`
-        from the input files with otherwise mismatched schemas.
-
-    Returns
-    -------
-    TableWithMetadata
-        The Table and its corresponding metadata (column names) that were read in.
+    options: ParquetReaderOptions
+        Settings for controlling reading behavior
     """
-    cdef table_with_metadata c_result
-    cdef parquet_reader_options opts = _setup_parquet_reader_options(
-        source_info,
-        columns,
-        row_groups,
-        filters,
-        convert_strings_to_categories,
-        use_pandas_metadata,
-        skip_rows,
-        nrows,
-        allow_mismatched_pq_schemas,
-    )
-
     with nogil:
-        c_result = move(cpp_read_parquet(opts))
+        c_result = move(cpp_read_parquet(options.c_obj))
 
     return TableWithMetadata.from_libcudf(c_result)
 
diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd
index a1f3b17936c..61fe33d6805 100644
--- a/python/pylibcudf/pylibcudf/io/types.pxd
+++ b/python/pylibcudf/pylibcudf/io/types.pxd
@@ -65,7 +65,6 @@ cdef class ColumnInMetadata:
 
 cdef class TableInputMetadata:
     cdef table_input_metadata c_obj
-    cdef list column_metadata
 
 cdef class TableWithMetadata:
     cdef public Table tbl
diff --git a/python/pylibcudf/pylibcudf/io/types.pyi b/python/pylibcudf/pylibcudf/io/types.pyi
index a3a559219ff..63fa9d1ff79 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyi
+++ b/python/pylibcudf/pylibcudf/io/types.pyi
@@ -64,6 +64,8 @@ class PartitionInfo:
 
 class TableInputMetadata:
     def __init__(self, table: Table): ...
+    @property
+    def column_metadata(self) -> list[ColumnInMetadata]: ...
 
 class ColumnInMetadata:
     def set_name(self, name: str) -> Self: ...
diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
index a2155829f2c..458595ca0e0 100644
--- a/python/pylibcudf/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -288,12 +288,14 @@ cdef class TableInputMetadata:
     """
     def __init__(self, Table table):
         self.c_obj = table_input_metadata(table.view())
-        self.column_metadata = [
+
+    @property
+    def column_metadata(self):
+        return [
             ColumnInMetadata.from_libcudf(&self.c_obj.column_metadata[i], self)
             for i in range(self.c_obj.column_metadata.size())
         ]
 
-
 cdef class TableWithMetadata:
     """A container holding a table and its associated metadata
     (e.g. column names)
diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd
index 4e8a01b41a5..46fdf62cd6b 100644
--- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
@@ -44,6 +44,11 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil:
         const table_view& input
     ) except +libcudf_exception_handler
 
+    cdef unique_ptr[column] xxhash_32(
+        const table_view& input,
+        const uint32_t seed
+    ) except +libcudf_exception_handler
+
     cdef unique_ptr[column] xxhash_64(
         const table_view& input,
         const uint64_t seed
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
index a7ca6978621..d23dd0685d1 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
@@ -5,6 +5,7 @@ from libc.stdint cimport int32_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
+from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 from pylibcudf.exception_handler cimport libcudf_exception_handler
@@ -17,6 +18,7 @@ cdef extern from "cudf/io/json.hpp" \
     cdef struct schema_element:
         data_type type
         map[string, schema_element] child_types
+        optional[vector[string]] column_order
 
     cpdef enum class json_recovery_mode_t(int32_t):
         FAIL
@@ -30,30 +32,51 @@ cdef extern from "cudf/io/json.hpp" \
             except +libcudf_exception_handler
         size_t get_byte_range_offset() except +libcudf_exception_handler
         size_t get_byte_range_size() except +libcudf_exception_handler
+        size_t get_byte_range_size_with_padding() except +libcudf_exception_handler
+        size_t get_byte_range_padding() except +libcudf_exception_handler
+        char get_delimiter() except +libcudf_exception_handler
         bool is_enabled_lines() except +libcudf_exception_handler
         bool is_enabled_mixed_types_as_string() except +libcudf_exception_handler
         bool is_enabled_prune_columns() except +libcudf_exception_handler
-        bool is_enabled_dayfirst() except +libcudf_exception_handler
         bool is_enabled_experimental() except +libcudf_exception_handler
+        bool is_enabled_dayfirst() except +libcudf_exception_handler
+        bool is_enabled_keep_quotes() except +libcudf_exception_handler
+        bool is_enabled_normalize_single_quotes() except +libcudf_exception_handler
+        bool is_enabled_normalize_whitespace() except +libcudf_exception_handler
+        json_recovery_mode_t recovery_mode() except +libcudf_exception_handler
+        bool is_strict_validation() except +libcudf_exception_handler
+        bool is_allowed_numeric_leading_zeros() except +libcudf_exception_handler
+        bool is_allowed_nonnumeric_numbers() except +libcudf_exception_handler
+        bool is_allowed_unquoted_control_chars() except +libcudf_exception_handler
+        vector[string] get_na_values() except +libcudf_exception_handler
 
         # setter
-        void set_dtypes(
-            vector[data_type] types
-        ) except +libcudf_exception_handler
-        void set_dtypes(
-            map[string, schema_element] types
-        ) except +libcudf_exception_handler
-        void set_compression(
-            cudf_io_types.compression_type compression
-        ) except +libcudf_exception_handler
+        void set_dtypes(vector[data_type] types) except +libcudf_exception_handler
+        void set_dtypes(map[string, data_type] types) except +libcudf_exception_handler
+        void set_dtypes(map[string, schema_element] types)\
+            except +libcudf_exception_handler
+        void set_dtypes(schema_element types) except +libcudf_exception_handler
+        void set_compression(cudf_io_types.compression_type comp_type)\
+            except +libcudf_exception_handler
         void set_byte_range_offset(size_t offset) except +libcudf_exception_handler
         void set_byte_range_size(size_t size) except +libcudf_exception_handler
+        void set_delimiter(char delimiter) except +libcudf_exception_handler
         void enable_lines(bool val) except +libcudf_exception_handler
         void enable_mixed_types_as_string(bool val) except +libcudf_exception_handler
         void enable_prune_columns(bool val) except +libcudf_exception_handler
-        void enable_dayfirst(bool val) except +libcudf_exception_handler
         void enable_experimental(bool val) except +libcudf_exception_handler
+        void enable_dayfirst(bool val) except +libcudf_exception_handler
         void enable_keep_quotes(bool val) except +libcudf_exception_handler
+        void enable_normalize_single_quotes(bool val) except +libcudf_exception_handler
+
+        void enable_normalize_whitespace(bool val) except +libcudf_exception_handler
+        void set_recovery_mode(json_recovery_mode_t val)\
+            except +libcudf_exception_handler
+        void set_strict_validation(bool val) except +libcudf_exception_handler
+        void allow_numeric_leading_zeros(bool val) except +libcudf_exception_handler
+        void allow_nonnumeric_numbers(bool val) except +libcudf_exception_handler
+        void allow_unquoted_control_chars(bool val) except +libcudf_exception_handler
+        void set_na_values(vector[string] vals) except +libcudf_exception_handler
 
         @staticmethod
         json_reader_options_builder builder(
@@ -74,6 +97,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& dtypes(
             map[string, schema_element] types
         ) except +libcudf_exception_handler
+        json_reader_options_builder& dtypes(
+            schema_element types
+        ) except +libcudf_exception_handler
         json_reader_options_builder& compression(
             cudf_io_types.compression_type compression
         ) except +libcudf_exception_handler
@@ -83,6 +109,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& byte_range_size(
             size_t size
         ) except +libcudf_exception_handler
+        json_reader_options_builder& delimiter(
+            char delimiter
+        ) except +libcudf_exception_handler
         json_reader_options_builder& lines(
             bool val
         ) except +libcudf_exception_handler
@@ -92,16 +121,36 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& prune_columns(
             bool val
         ) except +libcudf_exception_handler
+        json_reader_options_builder& experimental(
+            bool val
+        ) except +libcudf_exception_handler
         json_reader_options_builder& dayfirst(
             bool val
         ) except +libcudf_exception_handler
         json_reader_options_builder& keep_quotes(
             bool val
         ) except +libcudf_exception_handler
+        json_reader_options_builder& normalize_single_quotes(
+            bool val
+        ) except +libcudf_exception_handler
+        json_reader_options_builder& normalize_whitespace(
+            bool val
+        ) except +libcudf_exception_handler
         json_reader_options_builder& recovery_mode(
             json_recovery_mode_t val
         ) except +libcudf_exception_handler
 
+        json_reader_options_builder& strict_validation(bool val)\
+            except +libcudf_exception_handler
+        json_reader_options_builder& numeric_leading_zeros(bool val)\
+            except +libcudf_exception_handler
+        json_reader_options_builder& nonnumeric_numbers(bool val)\
+            except +libcudf_exception_handler
+        json_reader_options_builder& unquoted_control_chars(bool val)\
+            except +libcudf_exception_handler
+        json_reader_options_builder& na_values(vector[string] vals)\
+            except +libcudf_exception_handler
+
         json_reader_options build() except +libcudf_exception_handler
 
     cdef cudf_io_types.table_with_metadata read_json(
@@ -118,6 +167,8 @@ cdef extern from "cudf/io/json.hpp" \
         size_type get_rows_per_chunk() except +libcudf_exception_handler
         string get_true_value() except +libcudf_exception_handler
         string get_false_value() except +libcudf_exception_handler
+        cudf_io_types.compression_type get_compression()\
+            except +libcudf_exception_handler
 
         # setter
         void set_table(
@@ -132,6 +183,9 @@ cdef extern from "cudf/io/json.hpp" \
         void set_rows_per_chunk(size_type val) except +libcudf_exception_handler
         void set_true_value(string val) except +libcudf_exception_handler
         void set_false_value(string val) except +libcudf_exception_handler
+        void set_compression(
+            cudf_io_types.compression_type comptype
+        ) except +libcudf_exception_handler
 
         @staticmethod
         json_writer_options_builder builder(
@@ -169,6 +223,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_writer_options_builder& false_value(
             string val
         ) except +libcudf_exception_handler
+        json_writer_options_builder& compression(
+            cudf_io_types.compression_type comptype
+        ) except +libcudf_exception_handler
 
         json_writer_options build() except +libcudf_exception_handler
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index 8570531dfde..9d1e8cba425 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -11,18 +11,6 @@ from pylibcudf.libcudf.types cimport size_type
 cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] minhash(
-        const column_view &strings,
-        const numeric_scalar[uint32_t] seed,
-        const size_type width,
-    ) except +libcudf_exception_handler
-
-    cdef unique_ptr[column] minhash(
-        const column_view &strings,
-        const column_view &seeds,
-        const size_type width,
-    ) except +libcudf_exception_handler
-
-    cdef unique_ptr[column] minhash_permuted(
         const column_view &strings,
         const uint32_t seed,
         const column_view &a,
@@ -31,31 +19,9 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
     ) except +
 
     cdef unique_ptr[column] minhash64(
-        const column_view &strings,
-        const column_view &seeds,
-        const size_type width,
-    ) except +libcudf_exception_handler
-
-    cdef unique_ptr[column] minhash64(
-        const column_view &strings,
-        const numeric_scalar[uint64_t] seed,
-        const size_type width,
-    ) except +libcudf_exception_handler
-
-    cdef unique_ptr[column] minhash64_permuted(
         const column_view &strings,
         const uint64_t seed,
         const column_view &a,
         const column_view &b,
         const size_type width,
     ) except +
-
-    cdef unique_ptr[column] word_minhash(
-        const column_view &input,
-        const column_view &seeds
-    ) except +libcudf_exception_handler
-
-    cdef unique_ptr[column] word_minhash64(
-        const column_view &input,
-        const column_view &seeds
-    ) except +libcudf_exception_handler
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
index 6b544282f44..0af53748cdc 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
@@ -9,9 +9,7 @@ ctypedef fused ColumnOrScalar:
     Column
     Scalar
 
-cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)
-
-cpdef Column minhash_permuted(
+cpdef Column minhash(
     Column input,
     uint32_t seed,
     Column a,
@@ -19,16 +17,10 @@ cpdef Column minhash_permuted(
     size_type width
 )
 
-cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)
-
-cpdef Column minhash64_permuted(
+cpdef Column minhash64(
     Column input,
     uint64_t seed,
     Column a,
     Column b,
     size_type width
 )
-
-cpdef Column word_minhash(Column input, Column seeds)
-
-cpdef Column word_minhash64(Column input, Column seeds)
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi
index a2d9b6364f7..5d88cfbbea0 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi
@@ -1,13 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from pylibcudf.column import Column
-from pylibcudf.scalar import Scalar
 
 def minhash(
-    input: Column, seeds: Column | Scalar, width: int = 4
+    input: Column, seed: int, a: Column, b: Column, width: int
 ) -> Column: ...
 def minhash64(
-    input: Column, seeds: Column | Scalar, width: int = 4
+    input: Column, seed: int, a: Column, b: Column, width: int
 ) -> Column: ...
-def word_minhash(input: Column, seeds: Column) -> Column: ...
-def word_minhash64(input: Column, seeds: Column) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index 5448cc6de9b..84811cda867 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -8,69 +8,15 @@ from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
-    minhash64_permuted as cpp_minhash64_permuted,
-    minhash_permuted as cpp_minhash_permuted,
-    word_minhash as cpp_word_minhash,
-    word_minhash64 as cpp_word_minhash64,
 )
-from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
 from pylibcudf.libcudf.types cimport size_type
-from pylibcudf.scalar cimport Scalar
-
-from cython.operator import dereference
-import warnings
 
 __all__ = [
     "minhash",
     "minhash64",
-    "word_minhash",
-    "word_minhash64",
 ]
 
-cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
-    """
-    Returns the minhash values for each string per seed.
-    This function uses MurmurHash3_x86_32 for the hash algorithm.
-
-    For details, see :cpp:func:`minhash`.
-
-    Parameters
-    ----------
-    input : Column
-        Strings column to compute minhash
-    seeds : Column or Scalar
-        Seed value(s) used for the hash algorithm.
-    width : size_type
-        Character width used for apply substrings;
-        Default is 4 characters.
-
-    Returns
-    -------
-    Column
-        List column of minhash values for each string per seed
-    """
-    warnings.warn(
-        "Starting in version 25.02, the signature of this function will "
-        "be changed to match pylibcudf.nvtext.minhash_permuted.",
-        FutureWarning
-    )
-
-    cdef unique_ptr[column] c_result
-
-    if not isinstance(seeds, (Column, Scalar)):
-        raise TypeError("Must pass a Column or Scalar")
-
-    with nogil:
-        c_result = cpp_minhash(
-            input.view(),
-            seeds.view() if ColumnOrScalar is Column else
-            dereference(<numeric_scalar[uint32_t]*>seeds.c_obj.get()),
-            width
-        )
-
-    return Column.from_libcudf(move(c_result))
-
-cpdef Column minhash_permuted(
+cpdef Column minhash(
     Column input,
     uint32_t seed,
     Column a,
@@ -81,7 +27,7 @@ cpdef Column minhash_permuted(
     Returns the minhash values for each string.
     This function uses MurmurHash3_x86_32 for the hash algorithm.
 
-    For details, see :cpp:func:`minhash_permuted`.
+    For details, see :cpp:func:`minhash`.
 
     Parameters
     ----------
@@ -104,7 +50,7 @@ cpdef Column minhash_permuted(
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = cpp_minhash_permuted(
+        c_result = cpp_minhash(
             input.view(),
             seed,
             a.view(),
@@ -114,50 +60,7 @@ cpdef Column minhash_permuted(
 
     return Column.from_libcudf(move(c_result))
 
-cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
-    """
-    Returns the minhash values for each string per seed.
-    This function uses MurmurHash3_x64_128 for the hash algorithm.
-
-    For details, see :cpp:func:`minhash64`.
-
-    Parameters
-    ----------
-    input : Column
-        Strings column to compute minhash
-    seeds : Column or Scalar
-        Seed value(s) used for the hash algorithm.
-    width : size_type
-        Character width used for apply substrings;
-        Default is 4 characters.
-
-    Returns
-    -------
-    Column
-        List column of minhash values for each string per seed
-    """
-    warnings.warn(
-        "Starting in version 25.02, the signature of this function will "
-        "be changed to match pylibcudf.nvtext.minhash64_permuted.",
-        FutureWarning
-    )
-
-    cdef unique_ptr[column] c_result
-
-    if not isinstance(seeds, (Column, Scalar)):
-        raise TypeError("Must pass a Column or Scalar")
-
-    with nogil:
-        c_result = cpp_minhash64(
-            input.view(),
-            seeds.view() if ColumnOrScalar is Column else
-            dereference(<numeric_scalar[uint64_t]*>seeds.c_obj.get()),
-            width
-        )
-
-    return Column.from_libcudf(move(c_result))
-
-cpdef Column minhash64_permuted(
+cpdef Column minhash64(
     Column input,
     uint64_t seed,
     Column a,
@@ -168,7 +71,7 @@ cpdef Column minhash64_permuted(
     Returns the minhash values for each string.
     This function uses MurmurHash3_x64_128 for the hash algorithm.
 
-    For details, see :cpp:func:`minhash64_permuted`.
+    For details, see :cpp:func:`minhash64`.
 
     Parameters
     ----------
@@ -191,7 +94,7 @@ cpdef Column minhash64_permuted(
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = cpp_minhash64_permuted(
+        c_result = cpp_minhash64(
             input.view(),
             seed,
             a.view(),
@@ -200,62 +103,3 @@ cpdef Column minhash64_permuted(
         )
 
     return Column.from_libcudf(move(c_result))
-
-cpdef Column word_minhash(Column input, Column seeds):
-    """
-    Returns the minhash values for each row of strings per seed.
-    This function uses MurmurHash3_x86_32 for the hash algorithm.
-
-    For details, see :cpp:func:`word_minhash`.
-
-    Parameters
-    ----------
-    input : Column
-        Lists column of strings to compute minhash
-    seeds : Column or Scalar
-        Seed values used for the hash algorithm.
-
-    Returns
-    -------
-    Column
-        List column of minhash values for each string per seed
-    """
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = cpp_word_minhash(
-            input.view(),
-            seeds.view()
-        )
-
-    return Column.from_libcudf(move(c_result))
-
-cpdef Column word_minhash64(Column input, Column seeds):
-    """
-    Returns the minhash values for each row of strings per seed.
-    This function uses MurmurHash3_x64_128 for the hash algorithm though
-    only the first 64-bits of the hash are used in computing the output.
-
-    For details, see :cpp:func:`word_minhash64`.
-
-    Parameters
-    ----------
-    input : Column
-        Lists column of strings to compute minhash
-    seeds : Column or Scalar
-        Seed values used for the hash algorithm.
-
-    Returns
-    -------
-    Column
-        List column of minhash values for each string per seed
-    """
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = cpp_word_minhash64(
-            input.view(),
-            seeds.view()
-        )
-
-    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
index 3d9d99ffa61..bda8921b62a 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_avro.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
@@ -98,10 +98,15 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable):
     buffer.seek(0)
 
     res = plc.io.avro.read_avro(
-        plc.io.types.SourceInfo([buffer]),
-        columns=columns,
-        skip_rows=skip_rows,
-        num_rows=num_rows,
+        (
+            plc.io.avro.AvroReaderOptions.builder(
+                plc.io.types.SourceInfo([buffer])
+            )
+            .columns(columns)
+            .skip_rows(skip_rows)
+            .num_rows(num_rows)
+            .build()
+        )
     )
 
     expected = pa.Table.from_arrays(
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py
index 453e5ce32a8..747bbfa1370 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_json.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py
@@ -24,13 +24,19 @@ def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk):
     plc_table_w_meta, pa_table = table_data
     sink = source_or_sink
 
-    plc.io.json.write_json(
-        plc.io.SinkInfo([sink]),
-        plc_table_w_meta,
-        lines=lines,
-        rows_per_chunk=rows_per_chunk,
+    options = (
+        plc.io.json.JsonWriterOptions.builder(
+            plc.io.SinkInfo([sink]), plc_table_w_meta.tbl
+        )
+        .metadata(plc_table_w_meta)
+        .lines(lines)
+        .build()
     )
 
+    options.set_rows_per_chunk(rows_per_chunk)
+
+    plc.io.json.write_json(options)
+
     exp = pa_table.to_pandas()
 
     # Convert everything to string to make
@@ -57,13 +63,18 @@ def test_write_json_nulls(na_rep, include_nulls):
 
     sink = io.StringIO()
 
-    plc.io.json.write_json(
-        plc.io.SinkInfo([sink]),
-        plc_tbl_w_meta,
-        na_rep=na_rep,
-        include_nulls=include_nulls,
+    options = (
+        plc.io.json.JsonWriterOptions.builder(
+            plc.io.SinkInfo([sink]), plc_tbl_w_meta.tbl
+        )
+        .metadata(plc_tbl_w_meta)
+        .na_rep(na_rep)
+        .include_nulls(include_nulls)
+        .build()
     )
 
+    plc.io.json.write_json(options)
+
     exp = pa_tbl.to_pandas()
 
     # Convert everything to string to make
@@ -100,15 +111,21 @@ def test_write_json_bool_opts(true_value, false_value):
 
     sink = io.StringIO()
 
-    plc.io.json.write_json(
-        plc.io.SinkInfo([sink]),
-        plc_tbl_w_meta,
-        include_nulls=True,
-        na_rep="null",
-        true_value=true_value,
-        false_value=false_value,
+    options = (
+        plc.io.json.JsonWriterOptions.builder(
+            plc.io.SinkInfo([sink]), plc_tbl_w_meta.tbl
+        )
+        .metadata(plc_tbl_w_meta)
+        .na_rep("null")
+        .include_nulls(True)
+        .build()
     )
 
+    options.set_true_value(true_value)
+    options.set_false_value(false_value)
+
+    plc.io.json.write_json(options)
+
     exp = pa_tbl.to_pandas()
 
     # Convert everything to string to make
@@ -150,9 +167,12 @@ def test_read_json_basic(
         source.seek(0)
 
     res = plc.io.json.read_json(
-        plc.io.SourceInfo([source]),
-        compression=compression_type,
-        lines=lines,
+        (
+            plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source]))
+            .compression(compression_type)
+            .lines(lines)
+            .build()
+        )
     )
 
     # Adjustments to correct for the fact orient=records is lossy
@@ -226,9 +246,14 @@ def get_child_types(typ):
 
     new_schema = pa.schema(new_fields)
 
-    res = plc.io.json.read_json(
-        plc.io.SourceInfo([source]), dtypes=dtypes, lines=True
+    options = (
+        plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source]))
+        .lines(True)
+        .build()
     )
+    options.set_dtypes(dtypes)
+
+    res = plc.io.json.read_json(options)
     new_table = pa_table.cast(new_schema)
 
     # orient=records is lossy
@@ -252,10 +277,15 @@ def test_read_json_lines_byte_range(source_or_sink, chunk_size):
     for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size):
         tbls_w_meta.append(
             plc.io.json.read_json(
-                plc.io.SourceInfo([source]),
-                lines=True,
-                byte_range_offset=chunk_start,
-                byte_range_size=chunk_start + chunk_size,
+                (
+                    plc.io.json.JsonReaderOptions.builder(
+                        plc.io.SourceInfo([source])
+                    )
+                    .lines(True)
+                    .byte_range_offset(chunk_start)
+                    .byte_range_size(chunk_start + chunk_size)
+                    .build()
+                )
             )
         )
 
@@ -285,7 +315,12 @@ def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink):
     write_source_str(source, json_bytes)
 
     tbl_w_meta = plc.io.json.read_json(
-        plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes
+        (
+            plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source]))
+            .lines(True)
+            .keep_quotes(keep_quotes)
+            .build()
+        )
     )
 
     template = "{0}"
@@ -313,20 +348,19 @@ def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink):
     json_str = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
     write_source_str(source, json_str)
 
+    options = (
+        plc.io.json.JsonReaderOptions.builder(plc.io.SourceInfo([source]))
+        .lines(True)
+        .recovery_mode(recovery_mode)
+        .build()
+    )
+
     if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL:
         with pytest.raises(RuntimeError):
-            plc.io.json.read_json(
-                plc.io.SourceInfo([source]),
-                lines=True,
-                recovery_mode=recovery_mode,
-            )
+            plc.io.json.read_json(options)
     else:
         # Recover case (bad values replaced with nulls)
-        tbl_w_meta = plc.io.json.read_json(
-            plc.io.SourceInfo([source]),
-            lines=True,
-            recovery_mode=recovery_mode,
-        )
+        tbl_w_meta = plc.io.json.read_json(options)
         exp = pa.Table.from_arrays(
             [[1, 2, None, 3], [10, 11, None, 12]], names=["a", "b"]
         )
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
index 2557e40c935..fe35255505c 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_orc.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py
@@ -37,12 +37,17 @@ def test_read_orc_basic(
         binary_source_or_sink, pa_table, **_COMMON_ORC_SOURCE_KWARGS
     )
 
-    res = plc.io.orc.read_orc(
-        plc.io.SourceInfo([source]),
-        nrows=nrows,
-        skip_rows=skiprows,
-        columns=columns,
-    )
+    options = plc.io.orc.OrcReaderOptions.builder(
+        plc.io.types.SourceInfo([source])
+    ).build()
+    if nrows >= 0:
+        options.set_num_rows(nrows)
+    if skiprows >= 0:
+        options.set_skip_rows(skiprows)
+    if columns is not None and len(columns) > 0:
+        options.set_columns(columns)
+
+    res = plc.io.orc.read_orc(options)
 
     if columns is not None:
         pa_table = pa_table.select(columns)
diff --git a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
index 94524acbcc8..da535809745 100644
--- a/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
@@ -31,19 +31,24 @@ def test_read_parquet_basic(
         binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS
     )
 
-    res = plc.io.parquet.read_parquet(
-        plc.io.SourceInfo([source]),
-        nrows=nrows,
-        skip_rows=skiprows,
-        columns=columns,
-    )
+    options = plc.io.parquet.ParquetReaderOptions.builder(
+        plc.io.SourceInfo([source])
+    ).build()
+    if nrows > -1:
+        options.set_num_rows(nrows)
+    if skiprows != 0:
+        options.set_skip_rows(skiprows)
+    if columns is not None:
+        options.set_columns(columns)
+
+    res = plc.io.parquet.read_parquet(options)
 
     if columns is not None:
         pa_table = pa_table.select(columns)
 
     # Adapt to nrows/skiprows
     pa_table = pa_table.slice(
-        offset=skiprows, length=nrows if nrows != -1 else None
+        offset=skiprows, length=nrows if nrows > -1 else None
     )
 
     assert_table_and_meta_eq(pa_table, res, check_field_nullability=False)
@@ -95,9 +100,12 @@ def test_read_parquet_filters(
         binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS
     )
 
-    plc_table_w_meta = plc.io.parquet.read_parquet(
-        plc.io.SourceInfo([source]), filters=plc_filters
-    )
+    options = plc.io.parquet.ParquetReaderOptions.builder(
+        plc.io.SourceInfo([source])
+    ).build()
+    options.set_filter(plc_filters)
+
+    plc_table_w_meta = plc.io.parquet.read_parquet(options)
     exp = read_table(source, filters=pa_filters)
     assert_table_and_meta_eq(
         exp, plc_table_w_meta, check_field_nullability=False
diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py
index 83fb50fa4ef..7096dbe14ff 100644
--- a/python/pylibcudf/pylibcudf/tests/test_hashing.py
+++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 import hashlib
 import struct
@@ -34,7 +34,9 @@ def hash_single_uint32(val, seed=0):
 
 
 def hash_combine_32(lhs, rhs):
-    return np.uint32(lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)))
+    # Wrap the mixed value to 32 bits so it always fits in np.uint32.
+    mixed = lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2))
+    return np.uint32(int(mixed) % 2**32)
 
 
 def uint_hash_combine_32(lhs, rhs):
@@ -80,22 +82,6 @@ def list_struct_table():
     return data
 
 
-def python_hash_value(x, method):
-    if method == "murmurhash3_x86_32":
-        return libcudf_mmh3_x86_32(x)
-    elif method == "murmurhash3_x64_128":
-        hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)
-        hasher.update(x)
-        # libcudf returns a tuple of two 64-bit integers
-        return hasher.utupledigest()
-    elif method == "xxhash_64":
-        return xxhash.xxh64(
-            x, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
-        ).intdigest()
-    else:
-        return getattr(hashlib, method)(x).hexdigest()
-
-
 @pytest.mark.parametrize(
     "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"]
 )
@@ -115,6 +101,23 @@ def py_hasher(val):
     assert_column_eq(got, expect)
 
 
+def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl):
+    # Reference digests come from the `xxhash` package, seeded like libcudf.
+    seed = plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+
+    def py_hasher(val):
+        return xxhash.xxh32(scalar_to_binary(val), seed=seed).intdigest()
+
+    expect = pa.array(
+        list(map(py_hasher, pa_scalar_input_column.to_pylist())),
+        type=pa.uint32(),
+    )
+    # Hash the table through pylibcudf with the identical seed.
+    got = plc.hashing.xxhash_32(plc_scalar_input_tbl, seed)
+
+    assert_column_eq(got, expect)
+
+
 def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl):
     def py_hasher(val):
         return xxhash.xxh64(
@@ -125,7 +128,9 @@ def py_hasher(val):
         [py_hasher(val) for val in pa_scalar_input_column.to_pylist()],
         type=pa.uint64(),
     )
-    got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0)
+    got = plc.hashing.xxhash_64(
+        plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED
+    )
 
     assert_column_eq(got, expect)
 
diff --git a/python/pylibcudf/pylibcudf/tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py
index af80b6e5978..ca42eacdfdb 100644
--- a/python/pylibcudf/pylibcudf/tests/test_interop.py
+++ b/python/pylibcudf/pylibcudf/tests/test_interop.py
@@ -40,6 +40,28 @@ def test_struct_dtype_roundtrip():
     assert arrow_type == struct_type
 
 
+def test_table_with_nested_dtype_to_arrow():
+    # Round-trip a one-row list<struct> array through a pylibcudf Table.
+    pa_array = pa.array([[{"": 1}]])
+    plc_table = plc.Table([plc.interop.from_arrow(pa_array)])
+    result = plc.interop.to_arrow(plc_table)
+
+    # Assemble the expected schema inside-out; every level is unnamed and
+    # non-nullable.
+    leaf_field = pa.field("", pa.int64(), nullable=False)
+    struct_field = pa.field(
+        "",
+        pa.struct([leaf_field]),
+        nullable=False,
+    )
+    list_field = pa.field(
+        "",
+        pa.list_(struct_field),
+        nullable=False,
+    )
+    assert result.schema == pa.schema([list_field])
+
+
 def test_decimal128_roundtrip():
     decimal_type = pa.decimal128(10, 2)
     plc_type = plc.interop.from_arrow(decimal_type)
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
index ec533e64307..ad7a6f7a762 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -13,20 +13,13 @@ def minhash_input_data(request):
     return input_arr, seeds, request.param
 
 
-@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()])
-def word_minhash_input_data(request):
-    input_arr = pa.array([["foo", "bar"], ["foo foo", "bar bar"]])
-    seeds = pa.array([2, 3, 4, 5], request.param)
-    return input_arr, seeds, request.param
-
-
 @pytest.mark.parametrize("width", [5, 12])
-def test_minhash_permuted(minhash_input_data, width):
+def test_minhash(minhash_input_data, width):
     input_arr, seeds, seed_type = minhash_input_data
     minhash_func = (
-        plc.nvtext.minhash.minhash_permuted
+        plc.nvtext.minhash.minhash
         if seed_type == pa.uint32()
-        else plc.nvtext.minhash.minhash64_permuted
+        else plc.nvtext.minhash.minhash64
     )
     result = minhash_func(
         plc.interop.from_arrow(input_arr),
@@ -40,20 +33,3 @@ def test_minhash_permuted(minhash_input_data, width):
     assert pa_result.type == pa.list_(
         pa.field("element", seed_type, nullable=False)
     )
-
-
-def test_word_minhash(word_minhash_input_data):
-    input_arr, seeds, seed_type = word_minhash_input_data
-    word_minhash_func = (
-        plc.nvtext.minhash.word_minhash
-        if seed_type == pa.uint32()
-        else plc.nvtext.minhash.word_minhash64
-    )
-    result = word_minhash_func(
-        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds)
-    )
-    pa_result = plc.interop.to_arrow(result)
-    assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr))
-    assert pa_result.type == pa.list_(
-        pa.field("element", seed_type, nullable=False)
-    )
diff --git a/python/pylibcudf/pylibcudf/utils.pyx b/python/pylibcudf/pylibcudf/utils.pyx
index ee4421ddeaf..e9365ca1b36 100644
--- a/python/pylibcudf/pylibcudf/utils.pyx
+++ b/python/pylibcudf/pylibcudf/utils.pyx
@@ -5,7 +5,7 @@ from cython.operator import dereference
 from libc.stdint cimport uintptr_t
 from libcpp.functional cimport reference_wrapper
 from libcpp.vector cimport vector
-from cuda import cudart
+from cuda.bindings import runtime
 
 from pylibcudf.libcudf.scalar.scalar cimport scalar
 from pylibcudf.libcudf.types cimport bitmask_type
@@ -44,13 +44,13 @@ def _is_concurrent_managed_access_supported():
     """
 
     # Ensure CUDA is initialized before checking cudaDevAttrConcurrentManagedAccess
-    cudart.cudaFree(0)
+    runtime.cudaFree(0)
 
     device_id = 0
-    err, supports_managed_access = cudart.cudaDeviceGetAttribute(
-        cudart.cudaDeviceAttr.cudaDevAttrConcurrentManagedAccess, device_id
+    err, supports_managed_access = runtime.cudaDeviceGetAttribute(
+        runtime.cudaDeviceAttr.cudaDevAttrConcurrentManagedAccess, device_id
     )
-    if err != cudart.cudaError_t.cudaSuccess:
+    if err != runtime.cudaError_t.cudaSuccess:
         raise RuntimeError(
             f"Failed to check cudaDevAttrConcurrentManagedAccess with error {err}"
         )
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index a5e5704b8ed..e0055d5ebf8 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -18,7 +18,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "cuda-python>=11.7.1,<12.0a0",
+    "cuda-python>=11.8.5,<12.0a0",
     "libcudf==25.2.*,>=0.0.0a0",
     "nvtx>=0.2.1",
     "packaging",
@@ -99,8 +99,6 @@ addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib"
 empty_parameter_set_mark = "fail_at_collect"
 filterwarnings = [
   "error",
-  # https://github.com/rapidsai/build-planning/issues/116
-  "ignore:.*cuda..* module is deprecated.*:DeprecationWarning",
   "ignore:::.*xdist.*",
   "ignore:::.*pytest.*"
 ]