diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5e2f46714d9..e0b315f34fc 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -8,10 +8,9 @@ notebooks/ @rapidsai/cudf-python-codeowners python/dask_cudf/ @rapidsai/cudf-dask-codeowners #cmake code owners -cpp/CMakeLists.txt @rapidsai/cudf-cmake-codeowners -cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners -**/cmake/ @rapidsai/cudf-cmake-codeowners -*.cmake @rapidsai/cudf-cmake-codeowners +CMakeLists.txt @rapidsai/cudf-cmake-codeowners +**/cmake/ @rapidsai/cudf-cmake-codeowners +*.cmake @rapidsai/cudf-cmake-codeowners #java code owners java/ @rapidsai/cudf-java-codeowners diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index af49942c8cd..d80e4fef0d0 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -1,11 +1,13 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -euo pipefail package_name="libcudf" package_dir="python/libcudf" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + rapids-logger "Generating build requirements" rapids-dependency-file-generator \ @@ -28,8 +30,6 @@ export PIP_NO_BUILD_ISOLATION=0 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - mkdir -p ${package_dir}/final_dist python -m auditwheel repair \ --exclude libnvcomp.so.4 \ diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index db86721755d..3c6dba72164 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Support invoking test_python_cudf.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ @@ -24,8 +24,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -rapids-logger "pytest dask_cudf (dask-expr)" -DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ +rapids-logger "pytest dask_cudf" +./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ @@ -34,13 +34,6 @@ DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ --cov-report=term -rapids-logger "pytest dask_cudf (legacy)" -DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . - rapids-logger "pytest cudf_kafka" ./ci/run_cudf_kafka_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index e15949f4bdb..44f430ce98d 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -eou pipefail @@ -30,21 +30,11 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" # Run tests in dask_cudf/tests and dask_cudf/io/tests -rapids-logger "pytest dask_cudf (dask-expr)" +rapids-logger "pytest dask_cudf" pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ +python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ . 
popd - -# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy) -rapids-logger "pytest dask_cudf (legacy)" -pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . -popd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cb814aa8c0f..9dabe4e8800 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -276,7 +276,7 @@ rapids_cpm_init() include(${rapids-cmake-dir}/cpm/rapids_logger.cmake) rapids_cpm_rapids_logger() -rapids_make_logger(cudf EXPORT_SET cudf-exports) +rapids_make_logger(cudf EXPORT_SET cudf-exports LOGGER_DEFAULT_LEVEL WARN) # find jitify include(cmake/thirdparty/get_jitify.cmake) @@ -461,6 +461,7 @@ add_library( src/hash/sha256_hash.cu src/hash/sha384_hash.cu src/hash/sha512_hash.cu + src/hash/xxhash_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/arrow_utilities.cpp diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu index 3502cbcea2a..1085b03ac7b 100644 --- a/cpp/benchmarks/join/distinct_join.cu +++ b/cpp/benchmarks/join/distinct_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,13 +23,8 @@ void distinct_inner_join(nvbench::state& state, auto join = [](cudf::table_view const& probe_input, cudf::table_view const& build_input, cudf::null_equality compare_nulls) { - auto const has_nulls = - cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - auto hj_obj = cudf::distinct_hash_join{ - build_input, probe_input, has_nulls, compare_nulls}; - return hj_obj.inner_join(); + auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls}; + return hj_obj.inner_join(probe_input); }; BM_join(state, join); @@ -42,13 +37,8 @@ void distinct_left_join(nvbench::state& state, auto join = [](cudf::table_view const& probe_input, cudf::table_view const& build_input, cudf::null_equality compare_nulls) { - auto const has_nulls = - cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - auto hj_obj = cudf::distinct_hash_join{ - build_input, probe_input, has_nulls, compare_nulls}; - return hj_obj.left_join(); + auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls}; + return hj_obj.left_join(probe_input); }; BM_join(state, join); diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index 2acc10105cf..9a10163eb15 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,19 +36,24 @@ using cudf::experimental::row::lhs_index_type; using cudf::experimental::row::rhs_index_type; /** - * @brief An comparator adapter wrapping both self comparator and two table comparator + * @brief A custom comparator used for the build table insertion */ -template -struct comparator_adapter { - comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} - - __device__ constexpr auto operator()( +struct always_not_equal { + __device__ constexpr bool operator()( cuco::pair const&, cuco::pair const&) const noexcept { // All build table keys are distinct thus `false` no matter what return false; } +}; + +/** + * @brief A comparator adapter wrapping the two table comparator + */ +template +struct comparator_adapter { + comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} __device__ constexpr auto operator()( cuco::pair const& lhs, @@ -62,56 +67,14 @@ struct comparator_adapter { Equal _d_equal; }; -template -struct hasher_adapter { - hasher_adapter(Hasher const& d_hasher = {}) : _d_hasher{d_hasher} {} - - template - __device__ constexpr auto operator()(cuco::pair const& key) const noexcept - { - return _d_hasher(key.first); - } - - private: - Hasher _d_hasher; -}; - /** * @brief Distinct hash join that builds hash table in creation and probes results in subsequent * `*_join` member functions. * - * @tparam HasNested Flag indicating whether there are nested columns in build/probe table + * This class enables the distinct hash join scheme that builds hash table once, and probes as many + * times as needed (possibly in parallel). */ -template -struct distinct_hash_join { - private: - /// Device row equal type - using d_equal_type = cudf::experimental::row::equality::strong_index_comparator_adapter< - cudf::experimental::row::equality::device_row_comparator>; - using hasher = hasher_adapter>; - using probing_scheme_type = cuco::linear_probing<1, hasher>; - using cuco_storage_type = cuco::storage<1>; - - /// Hash table type - using hash_table_type = cuco::static_set, - cuco::extent, - cuda::thread_scope_device, - comparator_adapter, - probing_scheme_type, - cudf::detail::cuco_allocator, - cuco_storage_type>; - - bool _has_nulls; ///< true if nulls are present in either build table or probe table - cudf::null_equality _nulls_equal; ///< whether to consider nulls as equal - cudf::table_view _build; ///< input table to build the hash map - cudf::table_view _probe; ///< input table to probe the hash map - std::shared_ptr - _preprocessed_build; ///< input table preprocssed for row operators - std::shared_ptr - _preprocessed_probe; ///< input table preprocssed for row operators - hash_table_type _hash_table; ///< hash table built on `_build` - +class distinct_hash_join { public: distinct_hash_join() = delete; ~distinct_hash_join() = default; @@ -120,21 +83,28 @@ struct distinct_hash_join { distinct_hash_join& operator=(distinct_hash_join const&) = delete; distinct_hash_join& operator=(distinct_hash_join&&) = delete; + /** + * @brief Hasher adapter used by distinct hash join + */ + struct hasher { + template + __device__ constexpr hash_value_type operator()( + cuco::pair const& key) const noexcept + { + return key.first; + } + }; + /** + * @brief Constructor that internally builds the hash table based on the given `build` table. * * @throw cudf::logic_error if the number of columns in `build` table is 0.
* * @param build The build table, from which the hash table is built - * @param probe The probe table - * @param has_nulls Flag to indicate if any nulls exist in the `build` table or - * any `probe` table that will be used later for join. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. */ distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - bool has_nulls, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream); @@ -143,12 +113,36 @@ struct distinct_hash_join { */ std::pair>, std::unique_ptr>> - inner_join(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; /** * @copydoc cudf::distinct_hash_join::left_join */ std::unique_ptr> left_join( - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + private: + using probing_scheme_type = cuco::linear_probing<1, hasher>; + using cuco_storage_type = cuco::storage<1>; + + /// Hash table type + using hash_table_type = cuco::static_set, + cuco::extent, + cuda::thread_scope_device, + always_not_equal, + probing_scheme_type, + cudf::detail::cuco_allocator, + cuco_storage_type>; + + bool _has_nested_columns; ///< True if nested columns are present in build and probe tables + cudf::null_equality _nulls_equal; ///< Whether to consider nulls as equal + cudf::table_view _build; ///< Input table to build the hash map + std::shared_ptr + _preprocessed_build; ///< Input table preprocssed for row operators + hash_table_type _hash_table; ///< Hash table built on `_build` }; } // namespace cudf::detail diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index ea2f5d4b6ca..5edbb322231 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,7 +60,7 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 }; * @return `true` if the type is supported by `fixed_point` implementation */ template -constexpr inline auto is_supported_representation_type() +CUDF_HOST_DEVICE constexpr inline auto is_supported_representation_type() { return cuda::std::is_same_v || // cuda::std::is_same_v || // @@ -72,6 +72,24 @@ constexpr inline auto is_supported_representation_type() // Helper functions for `fixed_point` type namespace detail { +/** + * @brief Returns the smaller of the given scales + * + * @param a The left-hand side value to compare + * @param b The right-hand side value to compare + * @return The smaller of the given scales + */ +CUDF_HOST_DEVICE constexpr inline scale_type min(scale_type const& a, scale_type const& b) +{ + // TODO This is a temporary workaround because is not self-contained when + // built with NVRTC 11.8. Replace this with cuda::std::min once the underlying issue is resolved. +#ifdef __CUDA_ARCH__ + return scale_type{min(static_cast(a), static_cast(b))}; +#else + return std::min(a, b); +#endif +} + /** * @brief A function for integer exponentiation by squaring. 
* @@ -267,12 +285,12 @@ class fixed_point { * @return The `fixed_point` number in base 10 (aka human readable format) */ template >* = nullptr> - explicit constexpr operator U() const + CUDF_HOST_DEVICE explicit constexpr operator U() const { // Cast to the larger of the two types (of U and Rep) before converting to Rep because in // certain cases casting to U before shifting will result in integer overflow (i.e. if U = // int32_t, Rep = int64_t and _value > 2 billion) - auto const value = std::common_type_t(_value); + auto const value = cuda::std::common_type_t(_value); return static_cast(detail::shift(value, scale_type{-_scale})); } @@ -669,7 +687,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator+(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const sum = lhs.rescaled(scale)._value + rhs.rescaled(scale)._value; #if defined(__CUDACC_DEBUG__) @@ -687,7 +705,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator-(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const diff = lhs.rescaled(scale)._value - rhs.rescaled(scale)._value; #if defined(__CUDACC_DEBUG__) @@ -735,7 +753,7 @@ template CUDF_HOST_DEVICE inline bool operator==(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value == rhs.rescaled(scale)._value; } @@ -744,7 +762,7 @@ template CUDF_HOST_DEVICE inline bool operator!=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value != rhs.rescaled(scale)._value; } @@ -753,7 +771,7 @@ template CUDF_HOST_DEVICE inline bool operator<=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value <= rhs.rescaled(scale)._value; } @@ -762,7 +780,7 @@ template CUDF_HOST_DEVICE inline bool operator>=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value >= rhs.rescaled(scale)._value; } @@ -771,7 +789,7 @@ template CUDF_HOST_DEVICE inline bool operator<(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value < rhs.rescaled(scale)._value; } @@ -780,7 +798,7 @@ template CUDF_HOST_DEVICE inline bool operator>(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value; } @@ -789,7 +807,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator%(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const remainder = lhs.rescaled(scale)._value % rhs.rescaled(scale)._value; return fixed_point{scaled_integer{remainder, scale}}; } diff --git a/cpp/include/cudf/hashing.hpp 
b/cpp/include/cudf/hashing.hpp index 307a52cd242..88034b4f804 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -166,6 +166,26 @@ std::unique_ptr sha512( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Computes the XXHash_32 hash value of each row in the given table + * + * This function computes the hash of each column using the `seed` for the first column + * and the resulting hash as a seed for the next column and so on. + * The result is a uint32 value for each row. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr xxhash_32( + table_view const& input, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Computes the XXHash_64 hash value of each row in the given table * diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index 7cb80081a95..f796ff4526e 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,6 +61,11 @@ std::unique_ptr sha512(table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +std::unique_ptr xxhash_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view, + rmm::device_async_resource_ref mr); + std::unique_ptr xxhash_64(table_view const& input, uint64_t seed, rmm::cuda_stream_view, diff --git a/cpp/include/cudf/hashing/detail/xxhash_32.cuh b/cpp/include/cudf/hashing/detail/xxhash_32.cuh new file mode 100644 index 00000000000..bb6e7f18fbc --- /dev/null +++ b/cpp/include/cudf/hashing/detail/xxhash_32.cuh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::hashing::detail { + +template +struct XXHash_32 { + using result_type = std::uint32_t; + + CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} + + __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } + + __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes, + std::uint64_t size) const + { + return this->_impl.compute_hash(bytes, size); + } + + private: + template + __device__ constexpr result_type compute(T const& key) const + { + return this->compute_bytes(reinterpret_cast(&key), sizeof(T)); + } + + cuco::xxhash_32 _impl; +}; + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()(bool const& key) const +{ + return this->compute(static_cast(key)); +} + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()(float const& key) const +{ + return this->compute(normalize_nans_and_zeros(key)); +} + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()( + double const& key) const +{ + return this->compute(normalize_nans_and_zeros(key)); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(cudf::string_view const& key) const +{ + return this->compute_bytes(reinterpret_cast(key.data()), + key.size_bytes()); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(numeric::decimal32 const& key) const +{ + return this->compute(key.value()); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(numeric::decimal64 const& key) const +{ + return this->compute(key.value()); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(numeric::decimal128 const& key) const +{ + return this->compute(key.value()); +} + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()( + cudf::list_view const& key) const +{ + CUDF_UNREACHABLE("List column hashing is not supported"); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(cudf::struct_view const& key) const +{ + CUDF_UNREACHABLE("Direct hashing of struct_view is not supported"); +} + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index afefd04d4fa..cc63565eee1 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,13 +34,6 @@ namespace CUDF_EXPORT cudf { -/** - * @brief Enum to indicate whether the distinct join table has nested columns or not - * - * @ingroup column_join - */ -enum class has_nested : bool { YES, NO }; - // forward declaration namespace hashing::detail { @@ -61,7 +54,6 @@ class hash_join; /** * @brief Forward declaration for our distinct hash join */ -template class distinct_hash_join; } // namespace detail @@ -469,20 +461,19 @@ class hash_join { rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; private: - const std::unique_ptr _impl; + std::unique_ptr _impl; }; /** * @brief Distinct hash join that builds hash table in creation and probes results in subsequent * `*_join` member functions * + * This class enables the distinct hash join scheme that builds hash table once, and probes as many + * times as needed (possibly in parallel). + * * @note Behavior is undefined if the build table contains duplicates. * @note All NaNs are considered as equal - * - * @tparam HasNested Flag indicating whether there are nested columns in build/probe table */ -// TODO: `HasNested` to be removed via dispatching -template class distinct_hash_join { public: distinct_hash_join() = delete; @@ -496,15 +487,10 @@ class distinct_hash_join { * @brief Constructs a distinct hash join object for subsequent probe calls * * @param build The build table that contains distinct elements - * @param probe The probe table, from which the keys are probed - * @param has_nulls Flag to indicate if there exists any nulls in the `build` table or - * any `probe` table that will be used later for join * @param compare_nulls Controls whether null join-key values should match or not * @param stream CUDA stream used for device memory operations and kernel launches */ distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - nullable_join has_nulls = nullable_join::YES, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = cudf::get_default_stream()); @@ -512,16 +498,18 @@ class distinct_hash_join { * @brief Returns the row indices that can be used to construct the result of performing * an inner join between two tables. @see cudf::inner_join(). * + * @param probe The probe table, from which the keys are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned indices' device memory. * - * @return A pair of columns [`build_indices`, `probe_indices`] that can be used to + * @return A pair of columns [`probe_indices`, `build_indices`] that can be used to * construct the result of performing an inner join between two tables * with `build` and `probe` as the join keys. */ [[nodiscard]] std::pair>, std::unique_ptr>> - inner_join(rmm::cuda_stream_view stream = cudf::get_default_stream(), + inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** @@ -532,19 +520,22 @@ class distinct_hash_join { * the row index of the matched row from the build table if there is a match. Otherwise, contains * `JoinNoneValue`. * + * @param probe The probe table, from which the keys are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. 
+ * * @return A `build_indices` column that can be used to construct the result of * performing a left join between two tables with `build` and `probe` as the join * keys. */ [[nodiscard]] std::unique_ptr> left_join( + cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; private: - using impl_type = typename cudf::detail::distinct_hash_join; ///< Implementation type + using impl_type = cudf::detail::distinct_hash_join; ///< Implementation type std::unique_ptr _impl; ///< Distinct hash join implementation }; diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 6351a84e38f..c1dd79ef14f 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,7 +53,7 @@ namespace CUDF_EXPORT cudf { * @return The `cudf::type_id` corresponding to the specified type */ template -inline constexpr type_id type_to_id() +CUDF_HOST_DEVICE inline constexpr type_id type_to_id() { return type_id::EMPTY; }; diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 0e31a0b6cf5..2f255e7a07c 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,8 @@ #include #include +#include + namespace cudf { namespace binops { namespace compiled { @@ -51,7 +53,7 @@ struct type_casted_accessor { { if constexpr (column_device_view::has_element_accessor()) { auto const element = col.element(is_scalar ? 
0 : i); - if constexpr (std::is_convertible_v) { + if constexpr (cuda::std::is_convertible_v) { return static_cast(element); } else if constexpr (is_fixed_point() && cuda::std::is_floating_point_v) { return convert_fixed_to_floating(element); @@ -75,7 +77,7 @@ struct typed_casted_writer { FromType val) const { if constexpr (mutable_column_device_view::has_element_accessor() and - std::is_constructible_v) { + cuda::std::is_constructible_v) { col.element(i) = static_cast(val); } else if constexpr (is_fixed_point()) { auto const scale = numeric::scale_type{col.type().scale()}; @@ -109,18 +111,18 @@ struct ops_wrapper { template __device__ void operator()(size_type i) { - if constexpr (std::is_invocable_v) { + if constexpr (cuda::std::is_invocable_v) { TypeCommon x = type_dispatcher(lhs.type(), type_casted_accessor{}, i, lhs, is_lhs_scalar); TypeCommon y = type_dispatcher(rhs.type(), type_casted_accessor{}, i, rhs, is_rhs_scalar); auto result = [&]() { - if constexpr (std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v) { + if constexpr (cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v) { bool output_valid = false; auto result = BinaryOperator{}.template operator()( x, @@ -134,7 +136,7 @@ struct ops_wrapper { return BinaryOperator{}.template operator()(x, y); } // To suppress nvcc warning - return std::invoke_result_t{}; + return cuda::std::invoke_result_t{}; }(); if constexpr (is_bool_result()) out.element(i) = result; @@ -161,16 +163,16 @@ struct ops2_wrapper { __device__ void operator()(size_type i) { if constexpr (!has_common_type_v and - std::is_invocable_v) { + cuda::std::is_invocable_v) { TypeLhs x = lhs.element(is_lhs_scalar ? 0 : i); TypeRhs y = rhs.element(is_rhs_scalar ? 0 : i); auto result = [&]() { - if constexpr (std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v) { + if constexpr (cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v) { bool output_valid = false; auto result = BinaryOperator{}.template operator()( x, @@ -184,7 +186,7 @@ struct ops2_wrapper { return BinaryOperator{}.template operator()(x, y); } // To suppress nvcc warning - return std::invoke_result_t{}; + return cuda::std::invoke_result_t{}; }(); if constexpr (is_bool_result()) out.element(i) = result; diff --git a/cpp/src/hash/xxhash_32.cu b/cpp/src/hash/xxhash_32.cu new file mode 100644 index 00000000000..40503f7f911 --- /dev/null +++ b/cpp/src/hash/xxhash_32.cu @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +/** + * @brief Computes the hash value of a row in the given table. + * + * @tparam Nullate A cudf::nullate type describing whether to check for nulls. + */ +template +class device_row_hasher { + public: + device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed) + : _check_nulls(nulls), _table(t), _seed(seed) + { + } + + __device__ auto operator()(size_type row_index) const noexcept + { + return cudf::detail::accumulate( + _table.begin(), + _table.end(), + _seed, + [row_index, nulls = _check_nulls] __device__(auto hash, auto column) { + return cudf::type_dispatcher( + column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); + }); + } + + /** + * @brief Computes the hash value of an element in the given column. + */ + class element_hasher_adapter { + public: + template ())> + __device__ hash_value_type operator()(column_device_view const& col, + size_type const row_index, + Nullate const _check_nulls, + hash_value_type const _seed) const noexcept + { + if (_check_nulls && col.is_null(row_index)) { + return cuda::std::numeric_limits::max(); + } + auto const hasher = XXHash_32{_seed}; + return hasher(col.element(row_index)); + } + + template ())> + __device__ hash_value_type operator()(column_device_view const&, + size_type const, + Nullate const, + hash_value_type const) const noexcept + { + CUDF_UNREACHABLE("Unsupported type for XXHash_32"); + } + }; + + Nullate const _check_nulls; + table_device_view const _table; + hash_value_type const _seed; +}; + +} // namespace + +std::unique_ptr xxhash_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto output = make_numeric_column(data_type(type_to_id()), + input.num_rows(), + mask_state::UNALLOCATED, + stream, + mr); + + // Return early if there's nothing to hash + if (input.num_columns() == 0 || input.num_rows() == 0) { return output; } + + bool const nullable = has_nulls(input); + auto const input_view = table_device_view::create(input, stream); + auto output_view = output->mutable_view(); + + // Compute the hash value for each row + thrust::tabulate(rmm::exec_policy(stream), + output_view.begin(), + output_view.end(), + device_row_hasher(nullable, *input_view, seed)); + + return output; +} + +} // namespace detail + +std::unique_ptr xxhash_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::xxhash_32(input, seed, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 3a4e315348c..ac81dd421fa 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -416,11 +416,11 @@ std::optional is_compression_disabled(compression_type compression, memo_map_lock.unlock(); if (reason.has_value()) { - CUDF_LOG_INFO("nvCOMP is disabled for {} compression; reason: {}", + CUDF_LOG_INFO("nvCOMP is disabled for %s compression; reason: %s", compression_type_name(compression), reason.value()); } else { - CUDF_LOG_INFO("nvCOMP is enabled for {} compression", compression_type_name(compression)); + CUDF_LOG_INFO("nvCOMP is enabled for %s compression", compression_type_name(compression)); } return reason; @@ -445,11 +445,11 @@ std::optional is_decompression_disabled(compression_type compressio memo_map_lock.unlock(); if (reason.has_value()) { - CUDF_LOG_INFO("nvCOMP is disabled for {} decompression; reason: {}", + CUDF_LOG_INFO("nvCOMP is disabled for %s decompression; reason: %s", compression_type_name(compression), reason.value()); } else { - CUDF_LOG_INFO("nvCOMP is enabled for {} decompression", compression_type_name(compression)); + CUDF_LOG_INFO("nvCOMP is enabled for %s decompression", compression_type_name(compression)); } return reason; diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index e05353ee822..0d51526d925 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -771,7 +771,7 @@ table_with_metadata read_csv(cudf::io::datasource* source, if (!reader_opts.is_enabled_mangle_dupe_cols()) { for (auto& col_name : column_names) { if (++col_names_counts[col_name] > 1) { - CUDF_LOG_WARN("Multiple columns with name {}; only the first appearance is parsed", + CUDF_LOG_WARN("Multiple columns with name %s; only the first appearance is parsed", col_name); auto const idx = &col_name - column_names.data(); diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 7facc6497ed..469f933f918 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 1572b7246c0..1f84d1f81dc 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,6 +132,177 @@ struct orcdec_state_s { } vals; }; +/** + * @brief Manage caching of the first run of TIMESTAMP's DATA stream for a row group. + * + * This class is used to address a special case, where the first run of the DATA stream spans two + * adjacent row groups and its length is greater than the maximum length allowed to be consumed. + * This limit is imposed by the decoder when processing the SECONDARY stream. This class shall be + * instantiated in the shared memory, and be used to cache the DATA stream with a decoded data type + * of `int64_t`. 
As an optimization, the actual cache is implemented in the cache_helper class as a + * local variable and does not reside in the shared memory. + */ +class run_cache_manager { + private: + enum class status : uint8_t { + DISABLED, ///< Run cache manager is disabled. No caching will be performed. If the special case + ///< happens, the run cache manager will be set to this status after the cache read + ///< is completed. This status also applies when the special case does not happen. + CAN_WRITE_TO_CACHE, ///< Run cache manager is ready for write. If the special case happens, the + ///< run cache manager will be set to this status. + CAN_READ_FROM_CACHE, ///< Run cache manager is ready for read. If the special case happens, the + ///< run cache manager will be set to this status after the cache write is + ///< completed. + }; + + public: + /** + * @brief Initialize the run cache manager. + * + * @param[in] s ORC decoder state. + */ + __device__ void initialize(orcdec_state_s* s) + { + _status = (s->top.data.index.run_pos[CI_DATA2] > 0 and s->chunk.type_kind == TIMESTAMP) + ? status::CAN_WRITE_TO_CACHE + : status::DISABLED; + _reusable_length = 0; + _run_length = 0; + } + + private: + status _status; ///< The status of the run cache manager. + uint32_t + _reusable_length; ///< The number of data to be cached and reused later. For example, if a run + ///< has a length of 512 but the maximum length allowed to be consumed is + ///< capped at 162, then 350 (512-162) data will be cached. + uint32_t _run_length; ///< The length of the run, 512 in the above example. + friend class cache_helper; +}; + +/** + * @brief Helper class to help run_cache_manager cache the first run of TIMESTAMP's DATA stream for + * a row group. + * + * The run_cache_manager is intended to be stored in the shared memory, whereas the actual cache is + * in the local storage (as an optimization). If a function is to use run_cache_manager, both the + * manager and the cache objects need to be passed. This class is introduced to simplify the + * function call, so that only a single cache_helper object needs to be passed. To that end, public + * methods originally belonging to run_cache_manager have been moved to this class. + */ +class cache_helper { + public: + /** + * @brief Constructor. + * + * @param[in] run_cache_manager_inst An instance of run_cache_manager. + */ + __device__ explicit cache_helper(run_cache_manager& run_cache_manager_inst) + : _manager(run_cache_manager_inst) + { + } + + /** + * @brief Set the reusable length object. + * + * @param[in] run_length The length of the first run (spanning two adjacent row groups) of the + * DATA stream. + * @param[in] max_length The maximum length allowed to be consumed. This limit is imposed + * by the decoder when processing the SECONDARY stream. + */ + __device__ void set_reusable_length(uint32_t run_length, uint32_t max_length) + { + if (_manager._status == run_cache_manager::status::CAN_WRITE_TO_CACHE) { + _manager._run_length = run_length; + _manager._reusable_length = + (_manager._run_length > max_length) ? (_manager._run_length - max_length) : 0; + } + } + + /** + * @brief Adjust the maximum length allowed to be consumed when the length of the first run is + * greater than it. + * + * @param[in] max_length The maximum length allowed to be consumed for the DATA stream. + * @return A new maximum length. 
+ */ + [[nodiscard]] __device__ uint32_t adjust_max_length(uint32_t max_length) + { + auto new_max_length{max_length}; + if (_manager._status == run_cache_manager::status::CAN_READ_FROM_CACHE) { + new_max_length -= _manager._reusable_length; + } + return new_max_length; + } + + /** + * @brief Copy the excess data from the intermediate buffer for the DATA stream to the cache. + * + * @param[in] src Intermediate buffer for the DATA stream. + */ + __device__ void write_to_cache(int64_t* src) + { + if (_manager._status != run_cache_manager::status::CAN_WRITE_TO_CACHE) { return; } + + auto const tid = threadIdx.x; + + __syncthreads(); + + // All threads in the block always take a uniform code path for the following branches. + // _reusable_length ranges between [0, 512]. + if (_manager._reusable_length > 0) { + auto const length_to_skip = _manager._run_length - _manager._reusable_length; + if (tid < _manager._reusable_length) { + auto const src_idx = tid + length_to_skip; + _storage = src[src_idx]; + } + if (tid == 0) { _manager._status = run_cache_manager::status::CAN_READ_FROM_CACHE; } + } else { + if (tid == 0) { _manager._status = run_cache_manager::status::DISABLED; } + } + + __syncthreads(); + } + + /** + * @brief Copy the cached data to the intermediate buffer for the DATA stream. + * + * @param[in,out] dst Intermediate buffer for the DATA stream. + * @param[in,out] rle Run length decoder state object. + */ + __device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle) + { + if (_manager._status != run_cache_manager::status::CAN_READ_FROM_CACHE) { return; } + + auto const tid = threadIdx.x; + + // First, shift the data up + auto const dst_idx = tid + _manager._reusable_length; + auto const v = (dst_idx < rle->num_vals + _manager._reusable_length) ? dst[tid] : 0; + __syncthreads(); + + if (dst_idx < rle->num_vals + _manager._reusable_length) { dst[dst_idx] = v; } + __syncthreads(); + + // Second, insert the cached data + if (tid < _manager._reusable_length) { dst[tid] = _storage; } + __syncthreads(); + + if (tid == 0) { + // Disable the run cache manager, since cache write-and-read happens at most once per row + // group. + _manager._status = run_cache_manager::status::DISABLED; + rle->num_vals += _manager._reusable_length; + } + + __syncthreads(); + } + + private: + run_cache_manager& _manager; ///< An instance of run_cache_manager. + int64_t _storage; ///< Per-thread cache storage. +}; + /** * @brief Initializes byte stream, modifying length and start position to keep the read pointer * 8-byte aligned. @@ -631,6 +802,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { * @param[in] maxvals maximum number of values to decode * @param[in] t thread id * @param[in] has_buffered_values If true, means there are already buffered values + * @param[in] cache_helper_inst If non-null, the run cache manager will be used to manage + * caching of the first run of the DATA stream. 
* * @return number of values decoded */ @@ -640,9 +813,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, T* vals, uint32_t maxvals, int t, - bool has_buffered_values = false) + bool has_buffered_values = false, + cache_helper* cache_helper_inst = nullptr) { if (t == 0) { + if (cache_helper_inst != nullptr) { maxvals = cache_helper_inst->adjust_max_length(maxvals); } uint32_t maxpos = min(bs->len, bs->pos + (bytestream_buffer_size - 8u)); uint32_t lastpos = bs->pos; auto numvals = 0; @@ -685,6 +860,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, l += deltapos; } } + + if (cache_helper_inst != nullptr) { cache_helper_inst->set_reusable_length(n, maxvals); } + if ((numvals != 0) and (numvals + n > maxvals)) break; // case where there are buffered values and can't consume a whole chunk // from decoded values, so skip adding any more to buffer, work on buffered values and then @@ -866,6 +1044,17 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, __syncwarp(); } __syncthreads(); + // Currently run_cache_manager is only designed to fix the TIMESTAMP's DATA stream bug where the + // data type is int64_t. + if constexpr (cuda::std::is_same_v) { + if (cache_helper_inst != nullptr) { + // Run cache is read from during the 2nd iteration of the top-level while loop in + // gpuDecodeOrcColumnData(). + cache_helper_inst->read_from_cache(vals, rle); + // Run cache is written to during the 1st iteration of the loop. + cache_helper_inst->write_to_cache(vals); + } + } return rle->num_vals; } @@ -1401,6 +1590,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) // Struct doesn't have any data in itself, so skip bool const is_valid = s->chunk.type_kind != STRUCT; size_t const max_num_rows = s->chunk.column_num_rows; + __shared__ run_cache_manager run_cache_manager_inst; + cache_helper cache_helper_inst(run_cache_manager_inst); if (t == 0 and is_valid) { // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { @@ -1443,6 +1634,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); + + run_cache_manager_inst.initialize(s); } __syncthreads(); @@ -1602,7 +1795,13 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (is_rlev1(s->chunk.encoding_kind)) { numvals = Integer_RLEv1(bs, &s->u.rlev1, s->vals.i64, numvals, t); } else { - numvals = Integer_RLEv2(bs, &s->u.rlev2, s->vals.i64, numvals, t); + numvals = Integer_RLEv2(bs, + &s->u.rlev2, + s->vals.i64, + numvals, + t, + false /**has_buffered_values */, + &cache_helper_inst); } if (s->chunk.type_kind == DECIMAL) { // If we're using an index, we may have to drop values from the initial run diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index b5f9b894c46..0d40a1f7b1b 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 6b1a20701f9..77924ac0f35 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -719,7 +719,7 @@ std::vector construct_parquet_schema_tree( // all others default: CUDF_LOG_WARN( - "Unsupported page encoding requested: {}; the requested encoding will be ignored", + "Unsupported page encoding requested: %d; the requested encoding will be ignored", static_cast(col_meta.get_encoding())); return; } diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index dfa5d46cf48..975206646c6 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ class file_sink : public data_sink { if (cufile_integration::is_kvikio_enabled()) { cufile_integration::set_up_kvikio(); _kvikio_file = kvikio::FileHandle(filepath, "w"); - CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", + CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode %s.", _kvikio_file.is_compat_mode_preferred() ? "on" : "off"); } else { _cufile_out = detail::make_cufile_output(filepath); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 38dedcc2627..87b3c6facdf 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,7 +55,7 @@ class file_source : public datasource { if (cufile_integration::is_kvikio_enabled()) { cufile_integration::set_up_kvikio(); _kvikio_file = kvikio::FileHandle(filepath); - CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", + CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode %s.", _kvikio_file.is_compat_mode_preferred() ? "on" : "off"); } else { _cufile_in = detail::make_cufile_input(filepath); @@ -230,7 +230,7 @@ class memory_mapped_source : public file_source { { if (_map_addr != nullptr) { auto const result = munmap(_map_addr, _map_size); - if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); } + if (result != 0) { CUDF_LOG_WARN("munmap failed with %d", result); } _map_addr = nullptr; } } diff --git a/cpp/src/io/utilities/getenv_or.hpp b/cpp/src/io/utilities/getenv_or.hpp index b9613428418..acfd2221797 100644 --- a/cpp/src/io/utilities/getenv_or.hpp +++ b/cpp/src/io/utilities/getenv_or.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,10 +32,17 @@ T getenv_or(std::string_view env_var_name, T default_val) { auto const env_val = std::getenv(env_var_name.data()); if (env_val != nullptr) { - CUDF_LOG_INFO("Environment variable {} read as {}", env_var_name, env_val); + CUDF_LOG_INFO("Environment variable %.*s read as %s", + static_cast(env_var_name.length()), + env_var_name.data(), + env_val); } else { - CUDF_LOG_INFO( - "Environment variable {} is not set, using default value {}", env_var_name, default_val); + std::stringstream ss; + ss << default_val; + CUDF_LOG_INFO("Environment variable %.*s is not set, using default value %s", + static_cast(env_var_name.length()), + env_var_name.data(), + ss.str()); } if (env_val == nullptr) { return default_val; } diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index ce4d2067b82..d1a01ee76e4 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,28 +47,19 @@ namespace cudf { namespace detail { namespace { -template -auto prepare_device_equal( - std::shared_ptr build, - std::shared_ptr probe, - bool has_nulls, - cudf::null_equality compare_nulls) -{ - auto const two_table_equal = - cudf::experimental::row::equality::two_table_comparator(probe, build); - return comparator_adapter{two_table_equal.equal_to( - nullate::DYNAMIC{has_nulls}, compare_nulls)}; -} +bool constexpr has_nulls = true; ///< Always has nulls /** * @brief Device functor to create a pair of {hash_value, row_index} for a given row. - * - * @tparam Hasher The type of internal hasher to compute row hash. 
*/ -template +template class build_keys_fn { + using hasher = + cudf::experimental::row::hash::device_row_hasher; + public: - CUDF_HOST_DEVICE build_keys_fn(Hasher const& hash) : _hash{hash} {} + CUDF_HOST_DEVICE constexpr build_keys_fn(hasher const& hash) : _hash{hash} {} __device__ __forceinline__ auto operator()(size_type i) const noexcept { @@ -76,7 +67,7 @@ class build_keys_fn { } private: - Hasher _hash; + hasher _hash; }; /** @@ -92,26 +83,19 @@ struct output_fn { }; } // namespace -template -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - bool has_nulls, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) - : _has_nulls{has_nulls}, +distinct_hash_join::distinct_hash_join(cudf::table_view const& build, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) + : _has_nested_columns{cudf::has_nested_columns(build)}, _nulls_equal{compare_nulls}, _build{build}, - _probe{probe}, _preprocessed_build{ cudf::experimental::row::equality::preprocessed_table::create(_build, stream)}, - _preprocessed_probe{ - cudf::experimental::row::equality::preprocessed_table::create(_probe, stream)}, _hash_table{build.num_rows(), CUCO_DESIRED_LOAD_FACTOR, cuco::empty_key{cuco::pair{std::numeric_limits::max(), rhs_index_type{JoinNoneValue}}}, - prepare_device_equal( - _preprocessed_build, _preprocessed_probe, has_nulls, compare_nulls), + always_not_equal{}, {}, cuco::thread_scope_device, cuco_storage_type{}, @@ -124,10 +108,10 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, if (this->_build.num_rows() == 0) { return; } auto const row_hasher = experimental::row::hash::row_hasher{this->_preprocessed_build}; - auto const d_hasher = row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); + auto const d_hasher = row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); - auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_hasher}); + auto const iter = + cudf::detail::make_counting_transform_iterator(0, build_keys_fn{d_hasher}); size_type const build_table_num_rows{build.num_rows()}; if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(this->_build))) { @@ -146,15 +130,15 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, } } -template std::pair>, std::unique_ptr>> -distinct_hash_join::inner_join(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const +distinct_hash_join::inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"distinct_hash_join::inner_join"}; - size_type const probe_table_num_rows{this->_probe.num_rows()}; + size_type const probe_table_num_rows{probe.num_rows()}; // If output size is zero, return immediately if (probe_table_num_rows == 0) { @@ -162,25 +146,62 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, std::make_unique>(0, stream, mr)); } + auto preprocessed_probe = + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( + preprocessed_probe, _preprocessed_build); + auto build_indices = std::make_unique>(probe_table_num_rows, stream, mr); auto probe_indices = std::make_unique>(probe_table_num_rows, stream, mr); - auto const probe_row_hasher = - cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe}; - auto const d_probe_hasher = 
probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); - auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_probe_hasher}); + auto const probe_row_hasher = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; + auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); + auto const iter = cudf::detail::make_counting_transform_iterator( + 0, build_keys_fn{d_probe_hasher}); auto found_indices = rmm::device_uvector(probe_table_num_rows, stream); auto const found_begin = thrust::make_transform_output_iterator(found_indices.begin(), output_fn{}); - // TODO conditional find for nulls once `cuco::static_set::find_if` is added - // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not equal - // to `JoinNoneValue`, then `idx` has a match in the hash set. - this->_hash_table.find_async(iter, iter + probe_table_num_rows, found_begin, stream.value()); + auto const comparator_helper = [&](auto device_comparator) { + // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not + // equal to `JoinNoneValue`, then `idx` has a match in the hash set. + if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(probe))) { + this->_hash_table.find_async(iter, + iter + probe_table_num_rows, + comparator_adapter{device_comparator}, + hasher{}, + found_begin, + stream.value()); + } else { + auto stencil = thrust::counting_iterator{0}; + auto const row_bitmask = + cudf::detail::bitmask_and(probe, stream, cudf::get_current_device_resource_ref()).first; + auto const pred = + cudf::detail::row_is_valid{reinterpret_cast(row_bitmask.data())}; + + this->_hash_table.find_if_async(iter, + iter + probe_table_num_rows, + stencil, + pred, + comparator_adapter{device_comparator}, + hasher{}, + found_begin, + stream.value()); + } + }; + + if (_has_nested_columns) { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } else { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } auto const tuple_iter = cudf::detail::make_counting_transform_iterator( 0, @@ -203,16 +224,17 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, build_indices->resize(actual_size, stream); probe_indices->resize(actual_size, stream); - return {std::move(build_indices), std::move(probe_indices)}; + return {std::move(probe_indices), std::move(build_indices)}; } -template -std::unique_ptr> distinct_hash_join::left_join( - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const +std::unique_ptr> distinct_hash_join::left_join( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"distinct_hash_join::left_join"}; - size_type const probe_table_num_rows{this->_probe.num_rows()}; + size_type const probe_table_num_rows{probe.num_rows()}; // If output size is zero, return empty if (probe_table_num_rows == 0) { @@ -227,80 +249,82 @@ std::unique_ptr> distinct_hash_join::l thrust::fill( rmm::exec_policy_nosync(stream), build_indices->begin(), build_indices->end(), JoinNoneValue); } else { - auto const probe_row_hasher = - cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe}; - auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); - auto const iter 
= cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_probe_hasher}); + auto preprocessed_probe = + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( + preprocessed_probe, _preprocessed_build); + + auto const probe_row_hasher = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; + auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); + auto const iter = cudf::detail::make_counting_transform_iterator( + 0, build_keys_fn{d_probe_hasher}); auto const output_begin = thrust::make_transform_output_iterator(build_indices->begin(), output_fn{}); - // TODO conditional find for nulls once `cuco::static_set::find_if` is added - this->_hash_table.find_async(iter, iter + probe_table_num_rows, output_begin, stream.value()); + auto const comparator_helper = [&](auto device_comparator) { + if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(probe))) { + this->_hash_table.find_async(iter, + iter + probe_table_num_rows, + comparator_adapter{device_comparator}, + hasher{}, + output_begin, + stream.value()); + } else { + auto stencil = thrust::counting_iterator{0}; + auto const row_bitmask = + cudf::detail::bitmask_and(probe, stream, cudf::get_current_device_resource_ref()).first; + auto const pred = + cudf::detail::row_is_valid{reinterpret_cast(row_bitmask.data())}; + + this->_hash_table.find_if_async(iter, + iter + probe_table_num_rows, + stencil, + pred, + comparator_adapter{device_comparator}, + hasher{}, + output_begin, + stream.value()); + } + }; + + if (_has_nested_columns) { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } else { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } } return build_indices; } } // namespace detail -template <> -distinct_hash_join::~distinct_hash_join() = default; - -template <> -distinct_hash_join::~distinct_hash_join() = default; - -template <> -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - nullable_join has_nulls, - null_equality compare_nulls, - rmm::cuda_stream_view stream) - : _impl{std::make_unique( - build, probe, has_nulls == nullable_join::YES, compare_nulls, stream)} -{ -} - -template <> -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - nullable_join has_nulls, - null_equality compare_nulls, - rmm::cuda_stream_view stream) - : _impl{std::make_unique( - build, probe, has_nulls == nullable_join::YES, compare_nulls, stream)} -{ -} +distinct_hash_join::~distinct_hash_join() = default; -template <> -std::pair>, - std::unique_ptr>> -distinct_hash_join::inner_join(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const +distinct_hash_join::distinct_hash_join(cudf::table_view const& build, + null_equality compare_nulls, + rmm::cuda_stream_view stream) + : _impl{std::make_unique(build, compare_nulls, stream)} { - return _impl->inner_join(stream, mr); } -template <> std::pair>, std::unique_ptr>> -distinct_hash_join::inner_join(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const -{ - return _impl->inner_join(stream, mr); -} - -template <> -std::unique_ptr> -distinct_hash_join::left_join(rmm::cuda_stream_view stream, - 
rmm::device_async_resource_ref mr) const +distinct_hash_join::inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - return _impl->left_join(stream, mr); + return _impl->inner_join(probe, stream, mr); } -template <> -std::unique_ptr> distinct_hash_join::left_join( - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const +std::unique_ptr> distinct_hash_join::left_join( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - return _impl->left_join(stream, mr); + return _impl->left_join(probe, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 4f75908fe72..37c5698f654 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index b04e9961e01..b5063931485 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -64,10 +65,10 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str, if (str_length == 0) return tgt_length; if (tgt_length == 0) return str_length; - auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin(); - auto itr = str_length < tgt_length ? d_tgt.begin() : d_str.begin(); - // .first is min and .second is max - auto const [n, m] = std::minmax(str_length, tgt_length); + auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin(); + auto itr = str_length < tgt_length ? d_tgt.begin() : d_str.begin(); + auto const n = cuda::std::min(str_length, tgt_length); + auto const m = cuda::std::max(str_length, tgt_length); // setup compute buffer pointers auto v0 = buffer; auto v1 = v0 + n + 1; @@ -81,7 +82,7 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str, auto sub_cost = v0[j] + (*itr != *itr_tgt); auto del_cost = v0[j + 1] + 1; auto ins_cost = v1[j] + 1; - v1[j + 1] = std::min(std::min(sub_cost, del_cost), ins_cost); + v1[j + 1] = cuda::std::min(cuda::std::min(sub_cost, del_cost), ins_cost); } thrust::swap(v0, v1); } @@ -170,7 +171,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str ? 
d_targets.element(0) : d_targets.element(idx); // just need 2 integers for each character of the shorter string - return (std::min(d_str.length(), d_tgt.length()) + 1) * 2; + return (cuda::std::min(d_str.length(), d_tgt.length()) + 1) * 2; }); // get the total size of the temporary compute buffer @@ -241,7 +242,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con if (d_str1.empty() || d_str2.empty()) { return; } // the temp size needed is 2 integers per character of the shorter string d_offsets[idx - ((row + 1) * (row + 2)) / 2] = - (std::min(d_str1.length(), d_str2.length()) + 1) * 2; + (cuda::std::min(d_str1.length(), d_str2.length()) + 1) * 2; }); // get the total size for the compute buffer diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 2de94a4eb59..247440212d0 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -243,7 +244,7 @@ CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_string } } auto const char_count = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); } + if (lane_idx == 0) { d_counts[str_idx] = cuda::std::max(1, char_count - width + 1); } } /** diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 9a44d9477ab..9ce17c36b1f 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,14 +40,13 @@ #include #include +#include #include #include #include #include #include -#include - namespace nvtext { namespace detail { namespace { @@ -156,7 +155,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, // initialize the output -- only needed for wider strings auto d_output = d_results + (str_idx * param_count); for (auto i = lane_idx; i < param_count; i += tile_size) { - d_output[i] = std::numeric_limits::max(); + d_output[i] = cuda::std::numeric_limits::max(); } } } @@ -226,7 +225,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, ? section_size : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); - auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); + auto const init = size_bytes == 0 ? 
0 : cuda::std::numeric_limits::max(); auto const lane_idx = block.thread_rank(); auto const d_output = d_results + (str_idx * parameter_a.size()); @@ -235,7 +234,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, // constants used in the permutation calculations constexpr uint64_t mersenne_prime = (1UL << 61) - 1; - constexpr hash_value_type hash_max = std::numeric_limits::max(); + constexpr hash_value_type hash_max = cuda::std::numeric_limits::max(); // found to be an efficient shared memory size for both hash types __shared__ hash_value_type block_values[block_size * params_per_thread]; diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 943bcbe9b3a..b041ce3ce0a 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -196,7 +197,7 @@ struct sub_offset_fn { { // keep delimiter search within this sub-block auto const end = - d_input_chars + std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset); + d_input_chars + cuda::std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset); // starting point of this sub-block auto itr = d_input_chars + first_offset + ((idx + 1) * LS_SUB_BLOCK_SIZE); while ((itr < end) && diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index a3bed45e4bd..7a39199011e 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -134,8 +135,8 @@ extract_code_points_from_utf8(unsigned char const* strings, constexpr uint8_t max_utf8_blocks_for_char = 4; uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0}; - for (int i = 0; i < std::min(static_cast(max_utf8_blocks_for_char), - total_bytes - start_byte_for_thread); + for (int i = 0; i < cuda::std::min(static_cast(max_utf8_blocks_for_char), + total_bytes - start_byte_for_thread); ++i) { utf8_blocks[i] = strings[start_byte_for_thread + i]; } diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index dd1e8ddb027..19f144dd158 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,6 +27,8 @@ #include #include +#include +#include #include #include #include @@ -87,7 +89,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi // Deal with the start_word_indices array if (char_for_thread < num_code_points) { - uint32_t val_to_write = std::numeric_limits::max(); + uint32_t val_to_write = cuda::std::numeric_limits::max(); if ((code_points[char_for_thread] != SPACE_CODE_POINT) && (char_for_thread > 0) && (code_points[char_for_thread - 1] == SPACE_CODE_POINT)) { val_to_write = char_for_thread; @@ -95,7 +97,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi start_word_indices[char_for_thread] = val_to_write; // Deal with the end_word_indices_array - val_to_write = std::numeric_limits::max(); + val_to_write = cuda::std::numeric_limits::max(); if ((code_points[char_for_thread] != SPACE_CODE_POINT) && (char_for_thread + 1 < num_code_points) && (code_points[char_for_thread + 1] == SPACE_CODE_POINT)) { @@ -103,7 +105,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi } end_word_indices[char_for_thread] = val_to_write; - token_ids[char_for_thread] = std::numeric_limits::max(); + token_ids[char_for_thread] = cuda::std::numeric_limits::max(); tokens_per_word[char_for_thread] = 0; } } @@ -214,7 +216,7 @@ struct mark_special_tokens { __device__ void operator()(size_t idx) const { uint32_t const start_index = start_word_indices[idx]; - if ((start_index == std::numeric_limits::max()) || + if ((start_index == cuda::std::numeric_limits::max()) || ((start_index + MIN_ST_WIDTH + 2) > num_code_points)) return; if (code_points[start_index] != '[') return; @@ -225,12 +227,12 @@ struct mark_special_tokens { uint32_t const end_index = [&] { auto const begin = start_word_indices + start_pos; auto const width = - std::min(static_cast(MAX_ST_WIDTH + 1), (num_code_points - start_pos)); + cuda::std::min(static_cast(MAX_ST_WIDTH + 1), (num_code_points - start_pos)); auto const end = begin + width; // checking the next start-word is more reliable than arbitrarily searching for ']' // in case the text is split across string rows auto const iter = thrust::find_if(thrust::seq, begin + 1, end, [](auto swi) { - return swi != std::numeric_limits::max(); + return swi != cuda::std::numeric_limits::max(); }); return iter == end ? start_index : static_cast(iter - start_word_indices); }(); @@ -254,11 +256,11 @@ struct mark_special_tokens { thrust::fill(thrust::seq, start_word_indices + start_index + 1, // keep the first one start_word_indices + end_index + 1, - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); thrust::fill(thrust::seq, end_word_indices + start_index, end_word_indices + end_index + 1, - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); // reset the new end-word index end_word_indices[end_pos] = end_pos + 1; @@ -382,7 +384,7 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points, // We need to clean up the global array. This case is very uncommon. // Only 0.016% of words cannot be resolved to a token from the squad dev set. 
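
The replacements in this file (and in the text utilities above) swap host-only `std::` facilities for their libcu++ counterparts, which are callable from device code. A standalone CUDA sketch of the pattern, assuming only `<cuda/std/limits>` from libcu++ (the kernel and sizes are illustrative, not cudf code):

```cpp
// Sketch: cuda::std::numeric_limits works inside __device__ code, which is why the
// kernels above stop using std::numeric_limits for their sentinel values.
#include <cuda/std/limits>
#include <cuda_runtime.h>

#include <cstdint>

__global__ void fill_sentinel(uint32_t* out, int n)
{
  auto const idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  if (idx < n) { out[idx] = cuda::std::numeric_limits<uint32_t>::max(); }
}

int main()
{
  int const n      = 256;
  uint32_t* d_out  = nullptr;
  cudaMalloc(&d_out, n * sizeof(uint32_t));
  fill_sentinel<<<1, n>>>(d_out, n);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}
```
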
for (uint32_t i = 1; i < num_values_tokenized; ++i) { - token_ids[token_start + i] = std::numeric_limits::max(); + token_ids[token_start + i] = cuda::std::numeric_limits::max(); } num_values_tokenized = 0; } @@ -423,7 +425,10 @@ uvector_pair wordpiece_tokenizer::tokenize(cudf::strings_column_view const& inpu } struct copy_if_fn { // inline lambda not allowed in private or protected member function - __device__ bool operator()(uint32_t cp) { return cp != std::numeric_limits::max(); } + __device__ bool operator()(uint32_t cp) + { + return cp != cuda::std::numeric_limits::max(); + } }; struct tranform_fn { // just converting uint8 value to uint32 @@ -487,7 +492,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre auto itr_end = thrust::remove(rmm::exec_policy(stream), device_word_indices.begin(), device_word_indices.end(), - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); // The number of tokens selected will be double the number of words since we // select from both the start and end index arrays. @@ -523,7 +528,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre // token so this will always have enough memory to store the contiguous tokens. uint32_t* contiguous_token_ids = device_code_points; auto const copy_size = // thrust::copy_if limited to copying int-max values - std::min(device_token_ids.size(), static_cast(std::numeric_limits::max())); + cuda::std::min(device_token_ids.size(), + static_cast(cuda::std::numeric_limits::max())); auto ids_itr = device_token_ids.begin(); auto const ids_end = device_token_ids.end(); while (ids_itr != ids_end) { diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 4196523d211..73c4567d3a4 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ class fixed_pinned_pool_memory_resource { pool_size_{rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT)}, pool_{new host_pooled_mr(upstream_mr_, pool_size_, pool_size_)} { - CUDF_LOG_INFO("Pinned pool size = {}", pool_size_); + CUDF_LOG_INFO("Pinned pool size = %zu", pool_size_); // Allocate full size from the pinned pool to figure out the beginning and end address pool_begin_ = pool_->allocate_async(pool_size_, stream_); diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index b0f2d8c0637..80364885980 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -129,7 +129,8 @@ class rmm_cuda_stream_pool : public cuda_stream_pool { std::vector get_streams(std::size_t count) override { if (count > STREAM_POOL_SIZE) { - CUDF_LOG_WARN("get_streams called with count ({}) > pool size ({})", count, STREAM_POOL_SIZE); + CUDF_LOG_WARN( + "get_streams called with count (%zu) > pool size (%zu)", count, STREAM_POOL_SIZE); } auto streams = std::vector(); for (uint32_t i = 0; i < count; i++) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e5c29314203..344979e1288 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -192,6 +192,7 @@ ConfigureTest( hashing/sha256_test.cpp hashing/sha384_test.cpp hashing/sha512_test.cpp + hashing/xxhash_32_test.cpp hashing/xxhash_64_test.cpp ) diff --git a/cpp/tests/hashing/xxhash_32_test.cpp b/cpp/tests/hashing/xxhash_32_test.cpp new file mode 100644 index 00000000000..9e3c66b0d0b --- /dev/null +++ b/cpp/tests/hashing/xxhash_32_test.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +class XXHash_32_Test : public cudf::test::BaseFixture {}; + +TEST_F(XXHash_32_Test, TestInteger) +{ + auto col1 = cudf::test::fixed_width_column_wrapper{{0, 42, 825}}; + auto constexpr seed = 0u; + auto const output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed); + + // Expected results were generated with the reference implementation: + // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h + auto expected = + cudf::test::fixed_width_column_wrapper({148298089u, 1161967057u, 1066694813u}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); +} + +TEST_F(XXHash_32_Test, TestDouble) +{ + auto col1 = cudf::test::fixed_width_column_wrapper{{-8., 25., 90.}}; + auto constexpr seed = 42u; + + auto const output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed); + + // Expected results were generated with the reference implementation: + // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h + auto expected = + cudf::test::fixed_width_column_wrapper({2276435783u, 3120212431u, 3454197470u}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); +} + +TEST_F(XXHash_32_Test, StringType) +{ + auto col1 = cudf::test::strings_column_wrapper({"I", "am", "AI"}); + auto constexpr seed = 825u; + + auto output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed); + + // Expected results were generated with the reference implementation: + // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h + auto expected = + cudf::test::fixed_width_column_wrapper({320624298u, 1612654309u, 1409499009u}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); +} diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 9070efa38fe..e1ec8cda3ac 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -53,7 +53,7 @@ struct DistinctJoinTest : public cudf::test::BaseFixture { cudf::table_view const& expected_table, cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK) { - auto const& [build_join_indices, probe_join_indices] = result; + auto const& [probe_join_indices, build_join_indices] = result; auto build_indices_span = cudf::device_span{*build_join_indices}; auto probe_indices_span = cudf::device_span{*probe_join_indices}; @@ -89,10 +89,9 @@ TEST_F(DistinctJoinTest, IntegerInnerJoin) auto build_table = cudf::table_view{{build->view()}}; auto probe_table = cudf::table_view{{probe->view()}}; - auto distinct_join = cudf::distinct_hash_join{ - build_table, probe_table, cudf::nullable_join::NO}; + auto distinct_join = cudf::distinct_hash_join{build_table}; - auto result = distinct_join.inner_join(); + auto result = distinct_join.inner_join(probe_table); auto constexpr gold_size = size / 2; auto gold = cudf::sequence(gold_size, init, cudf::numeric_scalar{2}); @@ -120,8 +119,8 @@ TEST_F(DistinctJoinTest, InnerJoinNoNulls) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); column_wrapper col_gold_0{{1, 2}}; strcol_wrapper col_gold_1({"s0", "s0"}); @@ -162,8 +161,8 @@ TEST_F(DistinctJoinTest, InnerJoinWithNulls) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); column_wrapper col_gold_0{{3, 2}}; strcol_wrapper col_gold_1({"s1", "s0"}, {true, true}); @@ -229,8 +228,8 @@ TEST_F(DistinctJoinTest, InnerJoinWithStructsAndNulls) Table probe(std::move(cols0)); Table build(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); column_wrapper col_gold_0{{3, 2}}; strcol_wrapper col_gold_1({"s1", "s0"}, {true, true}); @@ -284,8 +283,8 @@ TEST_F(DistinctJoinTest, EmptyBuildTableInnerJoin) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); this->compare_to_reference(build.view(), probe.view(), result, build.view()); } @@ -307,9 +306,9 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -332,8 +331,8 @@ TEST_F(DistinctJoinTest, EmptyProbeTableInnerJoin) Table 
build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); this->compare_to_reference(build.view(), probe.view(), result, probe.view()); } @@ -355,9 +354,9 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -391,9 +390,9 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) cols_gold.push_back(col_gold_3.release()); Table gold(std::move(cols_gold)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -416,9 +415,9 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) Table probe(std::move(cols0)); Table build(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true}); @@ -461,9 +460,9 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) Table probe(std::move(cols0)); Table build(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; auto col0_gold_names_col = strcol_wrapper{ "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Ãœberwald"}; diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index 58396115a54..b5d20325b75 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. 
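
The test updates above exercise the refactored interface: the hash table is now built from the build table alone, each join call takes the probe table, and `inner_join` returns the pair ordered as {probe indices, build indices}. A hedged usage sketch of that pattern (header placement and the stream/memory-resource defaults are assumed; column setup omitted):

```cpp
// Hedged sketch of the refactored cudf::distinct_hash_join usage shown in the tests
// above: construct once from the build table, then probe it.
#include <cudf/join.hpp>  // assumed include; declared with the other join APIs
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/device_uvector.hpp>

#include <memory>
#include <utility>

std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
          std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
inner_gather_maps(cudf::table_view const& build, cudf::table_view const& probe)
{
  // Build table is expected to hold distinct keys; nulls compare equal here.
  cudf::distinct_hash_join joiner{build, cudf::null_equality::EQUAL};
  // Per this change, the result pair is {probe indices, build indices}.
  return joiner.inner_join(probe);
}
```
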
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,7 +55,7 @@ TEST_F(LoggerTest, DefaultLevel) cudf::default_logger().warn("warn"); cudf::default_logger().error("error"); cudf::default_logger().critical("critical"); - ASSERT_EQ(this->sink_content(), "info\nwarn\nerror\ncritical\n"); + ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 5024747227e..222b698a78d 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -63,11 +63,11 @@ keyword arguments, cuDF is not able to provide GPU acceleration and `cudf.pandas` will fall back to the CPU. The most accurate way to assess which functions run on the GPU is to try -running the code while using the `cudf.pandas` profiling features. The -profiler will indicate which functions ran on GPU / CPU. To improve -performance, try to use only functionality that can run entirely on GPU. -This helps reduce the number of memory transfers needed to fallback to -CPU. +running the code while using the `cudf.pandas` [profiling +features](cudf-pandas-profiling). The profiler will indicate which functions +ran on GPU / CPU. To improve performance, try to use only functionality that +can run entirely on GPU. This helps reduce the number of memory transfers +needed to fallback to CPU. ## How can I improve performance of my workflow with `cudf.pandas`? diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index 089f283e25d..fed63c2dd0f 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -75,6 +75,7 @@ with Pool(4) as pool: ... ``` +(cudf-pandas-profiling)= ## Profiling `cudf.pandas` `cudf.pandas` will attempt to use the GPU whenever possible and fall diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index 53af52eff07..5e544e92a77 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,12 +62,13 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f * @param filePath Full path of the input Parquet file to read. 
*/ public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, File filePath) { - handle = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - + long[] handles = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + filePath.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -100,12 +101,41 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, HostMemoryBuffer buffer, long offset, long len) { - handle = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, - buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; + if (handle == 0) { + throw new IllegalStateException("Cannot create native chunked Parquet reader object."); + } + multiHostBufferSourceHandle = handles[1]; + } + /** + * Construct the reader instance from a read limit and data in host memory buffers. + * + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param passReadLimit Limit on the amount of memory used for reading and decompressing data or + * 0 if there is no limit + * @param opts The options for Parquet reading. + * @param buffers Array of buffers containing the file data. The buffers are logically + * concatenated to construct the file being read. + */ + public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, + ParquetOptions opts, HostMemoryBuffer... buffers) { + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -181,6 +211,10 @@ public void close() { DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); dataSourceHandle = 0; } + if (multiHostBufferSourceHandle != 0) { + destroyMultiHostBufferSource(multiHostBufferSourceHandle); + multiHostBufferSourceHandle = 0; + } } @@ -196,6 +230,8 @@ public void close() { private long dataSourceHandle = 0; + private long multiHostBufferSourceHandle = 0; + /** * Create a native chunked Parquet reader object on heap and return its memory address. * @@ -206,13 +242,12 @@ public void close() { * @param filterColumnNames Name of the columns to read, or an empty array if we want to read all. * @param binaryToString Whether to convert the corresponding column to String if it is binary. 
* @param filePath Full path of the file to read, or given as null if reading from a buffer. - * @param bufferAddrs The address of a buffer to read from, or 0 if we are not using that buffer. - * @param length The length of the buffer to read from. + * @param bufferAddrsSizes The address and size pairs of buffers to read from, or null if we are not using buffers. * @param timeUnit Return type of time unit for timestamps. */ - private static native long create(long chunkSizeByteLimit, long passReadLimit, - String[] filterColumnNames, boolean[] binaryToString, - String filePath, long bufferAddrs, long length, int timeUnit); + private static native long[] create(long chunkSizeByteLimit, long passReadLimit, + String[] filterColumnNames, boolean[] binaryToString, + String filePath, long[] bufferAddrsSizes, int timeUnit); private static native long createWithDataSource(long chunkedSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); @@ -222,4 +257,6 @@ private static native long createWithDataSource(long chunkedSizeByteLimit, private static native long[] readChunk(long handle); private static native void close(long handle); + + private static native void destroyMultiHostBufferSource(long handle); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b01ce31b1f3..298f2cff6f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -313,12 +313,11 @@ private static native long readAndInferJSON(long address, long length, * all of them * @param binaryToString whether to convert this column to String if binary * @param filePath the path of the file to read, or null if no path should be read. - * @param address the address of the buffer to read from or 0 if we should not. - * @param length the length of the buffer to read from. + * @param addrsAndSizes the address and size pairs for every buffer or null for no buffers. * @param timeUnit return type of TimeStamp in units */ private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, - long address, long length, int timeUnit) throws CudfException; + long[] addrsAndSizes, int timeUnit) throws CudfException; private static native long[] readParquetFromDataSource(String[] filterColumnNames, boolean[] binaryToString, int timeUnit, @@ -1357,7 +1356,7 @@ public static Table readParquet(File path) { */ public static Table readParquet(ParquetOptions opts, File path) { return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); + path.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId())); } /** @@ -1402,6 +1401,14 @@ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, } } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffer raw parquet formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU. 
+ */ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, long len) { return readParquet(opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } @@ -1422,10 +1429,35 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); + } + + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffers Buffers containing the Parquet data. The buffers are logically concatenated + * in order to construct the file being read. + * @return the data parsed as a table on the GPU. + */ + public static Table readParquet(ParquetOptions opts, HostMemoryBuffer... buffers) { + assert buffers.length > 0; + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param ds custom datasource to provide the Parquet file data + * @return the data parsed as a table on the GPU. + */ public static Table readParquet(ParquetOptions opts, DataSource ds) { long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); try { diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9ff43feeac6..bd1714aa476 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -156,8 +156,9 @@ add_library( src/ScalarJni.cpp src/TableJni.cpp src/aggregation128_utils.cu - src/maps_column_view.cu src/check_nvcomp_output_sizes.cu + src/maps_column_view.cu + src/multi_host_buffer_source.cpp ) # Disable NVTX if necessary diff --git a/java/src/main/native/include/multi_host_buffer_source.hpp b/java/src/main/native/include/multi_host_buffer_source.hpp new file mode 100644 index 00000000000..2aedb2321e4 --- /dev/null +++ b/java/src/main/native/include/multi_host_buffer_source.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
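
For reference, a hedged sketch of calling the new varargs overload added above from Java; the class and method names are illustrative and the buffers are assumed to already hold consecutive slices of a single Parquet file:

```java
// Hedged sketch (not part of the patch): the buffers are logically concatenated in
// argument order to reconstruct the Parquet file, per the Javadoc added above.
import ai.rapids.cudf.HostMemoryBuffer;
import ai.rapids.cudf.ParquetOptions;
import ai.rapids.cudf.Table;

public class MultiBufferParquetExample {
  public static Table readSplitFile(HostMemoryBuffer first, HostMemoryBuffer second) {
    return Table.readParquet(ParquetOptions.DEFAULT, first, second);
  }
}
```
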
+ */ +#pragma once + +#include "jni_utils.hpp" + +#include + +#include + +namespace cudf { +namespace jni { + +/** + * @brief A custom datasource providing data from an array of host memory buffers. + */ +class multi_host_buffer_source : public cudf::io::datasource { + std::vector addrs_; + std::vector offsets_; + + size_t locate_offset_index(size_t offset); + + public: + explicit multi_host_buffer_source(native_jlongArray const& addrs_sizes); + std::unique_ptr host_read(size_t offset, size_t size) override; + size_t host_read(size_t offset, size_t size, uint8_t* dst) override; + bool supports_device_read() const override { return true; } + bool is_device_read_preferred(size_t size) const override { return true; } + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override; + size_t device_read(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override; + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override; + size_t size() const override { return offsets_.back(); } +}; + +} // namespace jni +} // namespace cudf diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp index cf04a87262f..4967e0b2b04 100644 --- a/java/src/main/native/src/ChunkedReaderJni.cpp +++ b/java/src/main/native/src/ChunkedReaderJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include "cudf_jni_apis.hpp" #include "jni_utils.hpp" +#include "multi_host_buffer_source.hpp" #include #include @@ -36,7 +37,7 @@ extern "C" { // This function should take all the parameters that `Table.readParquet` takes, // plus one more parameter `long chunkSizeByteLimit`. 
-JNIEXPORT jlong JNICALL +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jclass, jlong chunk_read_limit, @@ -44,27 +45,26 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inp_file_path, - jlong buffer, - jlong buffer_length, + jlongArray addrs_sizes, jint unit) { - JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0); + JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", nullptr); bool read_buffer = true; - if (buffer == 0) { - JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", 0); + if (addrs_sizes == nullptr) { + JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", nullptr); read_buffer = false; } else if (inp_file_path != nullptr) { - JNI_THROW_NEW( - env, cudf::jni::ILLEGAL_ARG_CLASS, "Cannot pass in both a buffer and an inp_file_path", 0); - } else if (buffer_length <= 0) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0); + JNI_THROW_NEW(env, + cudf::jni::ILLEGAL_ARG_CLASS, + "Cannot pass in both buffers and an inp_file_path", + nullptr); } try { cudf::jni::auto_set_device(env); cudf::jni::native_jstring filename(env, inp_file_path); if (!read_buffer && filename.is_empty()) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", 0); + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", nullptr); } cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); @@ -75,9 +75,15 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); (void)n_col_binary_read; - auto const source = read_buffer ? 
cudf::io::source_info(reinterpret_cast(buffer), - static_cast(buffer_length)) - : cudf::io::source_info(filename.get()); + cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_sizes); + std::unique_ptr multi_buffer_source; + cudf::io::source_info source; + if (read_buffer) { + multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes)); + source = cudf::io::source_info(multi_buffer_source.get()); + } else { + source = cudf::io::source_info(filename.get()); + } auto opts_builder = cudf::io::parquet_reader_options::builder(source); if (n_filter_col_names.size() > 0) { @@ -86,13 +92,18 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, auto const read_opts = opts_builder.convert_strings_to_categories(false) .timestamp_type(cudf::data_type(static_cast(unit))) .build(); - - return reinterpret_cast( + n_addrs_sizes.cancel(); + n_col_binary_read.cancel(); + auto reader_handle = reinterpret_cast( new cudf::io::chunked_parquet_reader(static_cast(chunk_read_limit), static_cast(pass_read_limit), read_opts)); + cudf::jni::native_jlongArray result(env, 2); + result[0] = reader_handle; + result[1] = cudf::jni::release_as_jlong(multi_buffer_source); + return result.get_jArray(); } - CATCH_STD(env, 0); + CATCH_STD(env, nullptr); } JNIEXPORT jlong JNICALL @@ -177,6 +188,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* en CATCH_STD(env, ); } +JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_destroyMultiHostBufferSource( + JNIEnv* env, jclass, jlong handle) +{ + JNI_NULL_CHECK(env, handle, "handle is null", ); + + try { + delete reinterpret_cast(handle); + } + CATCH_STD(env, ); +} + // // Chunked ORC reader JNI // diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 1f8b1ea207d..a6c7ae9ba18 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include "jni_compiled_expr.hpp" #include "jni_utils.hpp" #include "jni_writer_data_sink.hpp" +#include "multi_host_buffer_source.hpp" #include #include @@ -2071,20 +2072,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inputfilepath, - jlong buffer, - jlong buffer_length, + jlongArray addrs_and_sizes, jint unit) { JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0); bool read_buffer = true; - if (buffer == 0) { + if (addrs_and_sizes == nullptr) { JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL); read_buffer = false; } else if (inputfilepath != NULL) { JNI_THROW_NEW( env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL); - } else if (buffer_length <= 0) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL); } try { @@ -2096,10 +2094,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env, cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); - - auto source = read_buffer ? 
cudf::io::source_info(reinterpret_cast(buffer), - static_cast(buffer_length)) - : cudf::io::source_info(filename.get()); + cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_and_sizes); + std::unique_ptr multi_buffer_source; + cudf::io::source_info source; + if (read_buffer) { + multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes)); + source = cudf::io::source_info(multi_buffer_source.get()); + } else { + source = cudf::io::source_info(filename.get()); + } auto builder = cudf::io::parquet_reader_options::builder(source); if (n_filter_col_names.size() > 0) { @@ -2110,7 +2113,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env, builder.convert_strings_to_categories(false) .timestamp_type(cudf::data_type(static_cast(unit))) .build(); - return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl); + auto tbl = cudf::io::read_parquet(opts).tbl; + n_col_binary_read.cancel(); + n_addrs_sizes.cancel(); + return convert_table_for_return(env, tbl); } CATCH_STD(env, NULL); } @@ -2901,16 +2907,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftDistinctJoinGatherMap j_right_keys, compare_nulls_equal, [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) { - auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - if (cudf::has_nested_columns(right)) { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - return hash.left_join(); - } else { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - return hash.left_join(); - } + cudf::distinct_hash_join hash(right, nulleq); + return hash.left_join(left); }); } @@ -3119,22 +3117,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerDistinctJoinGatherMa j_right_keys, compare_nulls_equal, [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) { - auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - std::pair>, - std::unique_ptr>> - maps; - if (cudf::has_nested_columns(right)) { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - maps = hash.inner_join(); - } else { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - maps = hash.inner_join(); - } - // Unique join returns {right map, left map} but all the other joins - // return {left map, right map}. Swap here to make it consistent. - return std::make_pair(std::move(maps.second), std::move(maps.first)); + cudf::distinct_hash_join hash(right, nulleq); + return hash.inner_join(left); }); } diff --git a/java/src/main/native/src/multi_host_buffer_source.cpp b/java/src/main/native/src/multi_host_buffer_source.cpp new file mode 100644 index 00000000000..c577fc680ba --- /dev/null +++ b/java/src/main/native/src/multi_host_buffer_source.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
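
The datasource implementation that follows resolves a file offset to its owning buffer through an exclusive prefix sum of the buffer sizes. A standalone sketch of that lookup idiom, using plain std:: containers and no JNI types:

```cpp
// Sketch of the offset lookup used below: offsets holds {0, s0, s0+s1, ..., total},
// so the owning buffer of a valid offset is the predecessor of upper_bound(offset).
#include <algorithm>
#include <cstddef>
#include <vector>

std::size_t locate_buffer_index(std::vector<std::size_t> const& offsets, std::size_t offset)
{
  auto const it = std::upper_bound(offsets.begin(), offsets.end(), offset);
  return static_cast<std::size_t>(it - offsets.begin()) - 1;
}
```
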
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "multi_host_buffer_source.hpp" + +#include +#include +#include +#include + +namespace cudf { +namespace jni { + +multi_host_buffer_source::multi_host_buffer_source(native_jlongArray const& addrs_sizes) +{ + if (addrs_sizes.size() % 2 != 0) { + throw std::logic_error("addrs_sizes length not a multiple of 2"); + } + auto count = addrs_sizes.size() / 2; + addrs_.reserve(count); + offsets_.reserve(count + 1); + size_t total_size = 0; + for (int i = 0; i < addrs_sizes.size(); i += 2) { + addrs_.push_back(reinterpret_cast(addrs_sizes[i])); + offsets_.push_back(total_size); + total_size += addrs_sizes[i + 1]; + } + offsets_.push_back(total_size); +} + +size_t multi_host_buffer_source::locate_offset_index(size_t offset) +{ + if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); } + auto start = offsets_.begin(); + auto it = std::upper_bound(start, offsets_.end(), offset); + return (it - start) - 1; +} + +std::unique_ptr multi_host_buffer_source::host_read(size_t offset, + size_t size) +{ + if (size == 0) { return 0; } + if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); } + auto const end_offset = offset + size; + if (end_offset > offsets_.back()) { throw std::runtime_error("read past end of file"); } + auto buffer_index = locate_offset_index(offset); + auto next_offset = offsets_[buffer_index + 1]; + if (end_offset <= next_offset) { + // read range hits only a single buffer, so return a zero-copy view of the data + auto src = addrs_[buffer_index] + offset - offsets_[buffer_index]; + return std::make_unique(src, size); + } + auto buf = std::vector(size); + auto bytes_read = host_read(offset, size, buf.data()); + if (bytes_read != size) { + std::stringstream ss; + ss << "Expected host read of " << size << " found " << bytes_read; + throw std::logic_error(ss.str()); + } + return std::make_unique>>(std::move(buf)); +} + +size_t multi_host_buffer_source::host_read(size_t offset, size_t size, uint8_t* dst) +{ + if (size == 0) { return 0; } + if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); } + if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); } + auto buffer_index = locate_offset_index(offset); + auto bytes_left = size; + while (bytes_left > 0) { + auto next_offset = offsets_[buffer_index + 1]; + auto buffer_left = next_offset - offset; + auto buffer_offset = offset - offsets_[buffer_index]; + auto src = addrs_[buffer_index] + buffer_offset; + auto copy_size = std::min(buffer_left, bytes_left); + std::memcpy(dst, src, copy_size); + offset += copy_size; + dst += copy_size; + bytes_left -= copy_size; + ++buffer_index; + } + return size; +} + +std::unique_ptr multi_host_buffer_source::device_read( + size_t offset, size_t size, rmm::cuda_stream_view stream) +{ + rmm::device_buffer buf(size, stream); + auto dst = static_cast(buf.data()); + auto bytes_read = device_read(offset, size, dst, stream); + if (bytes_read != size) { + std::stringstream ss; + ss << "Expected device read of " << size << " found " << bytes_read; + throw std::logic_error(ss.str()); + } + return std::make_unique>(std::move(buf)); +} + +size_t multi_host_buffer_source::device_read(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) +{ + if (size == 0) { return 0; } + if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad 
offset"); } + if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); } + auto buffer_index = locate_offset_index(offset); + auto bytes_left = size; + while (bytes_left > 0) { + auto next_offset = offsets_[buffer_index + 1]; + auto buffer_left = next_offset - offset; + auto buffer_offset = offset - offsets_[buffer_index]; + auto src = addrs_[buffer_index] + buffer_offset; + auto copy_size = std::min(buffer_left, bytes_left); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, copy_size, cudaMemcpyHostToDevice, stream.value())); + offset += copy_size; + dst += copy_size; + bytes_left -= copy_size; + ++buffer_index; + } + return size; +} + +std::future multi_host_buffer_source::device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) +{ + std::promise p; + p.set_value(device_read(offset, size, dst, stream)); + return p.get_future(); +} + +} // namespace jni +} // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index c7fcb1756b6..7eb32892bad 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,8 +47,11 @@ import java.math.BigInteger; import java.math.RoundingMode; import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.SeekableByteChannel; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.StandardOpenOption; import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; @@ -1714,6 +1717,42 @@ void testChunkedReadParquet() { } } + @Test + void testChunkedReadParquetHostBuffers() throws Exception { + long size = TEST_PARQUET_FILE_CHUNKED_READ.length(); + java.nio.file.Path path = TEST_PARQUET_FILE_CHUNKED_READ.toPath(); + try (HostMemoryBuffer buf1 = HostMemoryBuffer.allocate(size / 2); + HostMemoryBuffer buf2 = HostMemoryBuffer.allocate(size - buf1.getLength())) { + try (SeekableByteChannel channel = Files.newByteChannel(path, StandardOpenOption.READ)) { + ByteBuffer bb1 = buf1.asByteBuffer(); + while (bb1.hasRemaining()) { + if (channel.read(bb1) == -1) { + throw new EOFException("error reading first buffer"); + } + } + ByteBuffer bb2 = buf2.asByteBuffer(); + while (bb2.hasRemaining()) { + if (channel.read(bb2) == -1) { + throw new EOFException("error reading second buffer"); + } + } + } + ParquetOptions opts = ParquetOptions.DEFAULT; + try (ParquetChunkedReader reader = new ParquetChunkedReader(240000, 0, opts, buf1, buf2)) { + int numChunks = 0; + long totalRows = 0; + while(reader.hasNext()) { + ++numChunks; + try(Table chunk = reader.readChunk()) { + totalRows += chunk.getRowCount(); + } + } + assertEquals(2, numChunks); + assertEquals(40000, totalRows); + } + } + } + @Test void testChunkedReadParquetFromDataSource() throws IOException { try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ); diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 40bd50acf16..fd6d0257940 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -260,26 +260,3 @@ cdef class DeviceScalar: self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ (cdtype_id) ] - - 
-def as_device_scalar(val, dtype=None): - if isinstance(val, (cudf.Scalar, DeviceScalar)): - if dtype == val.dtype or dtype is None: - if isinstance(val, DeviceScalar): - return val - else: - return val.device_value - else: - raise TypeError("Can't update dtype of existing GPU scalar") - else: - return cudf.Scalar(val, dtype=dtype).device_value - - -def _is_null_host_scalar(slr): - if cudf.utils.utils.is_na_like(slr): - return True - elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \ - slr is pd.NaT: - return True - else: - return False diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index b10b8dfe207..d705b4d4c21 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -621,7 +621,7 @@ def ordered(self) -> bool: def __setitem__(self, key, value): if cudf.api.types.is_scalar( value - ) and cudf._lib.scalar._is_null_host_scalar(value): + ) and cudf.utils.utils._is_null_host_scalar(value): to_add_categories = 0 else: if cudf.api.types.is_scalar(value): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 31efe267c96..24b657f1c32 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,7 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.scalar import as_device_scalar from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -71,7 +70,7 @@ min_signed_type, min_unsigned_type, ) -from cudf.utils.utils import _array_ufunc, mask_dtype +from cudf.utils.utils import _array_ufunc, _is_null_host_scalar, mask_dtype if TYPE_CHECKING: import builtins @@ -777,9 +776,7 @@ def fillna( if not self.has_nulls(include_nan=True): return self.copy() elif method is None: - if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar( - fill_value - ): + if is_scalar(fill_value) and _is_null_host_scalar(fill_value): return self.copy() else: fill_value = self._validate_fillna_value(fill_value) @@ -1984,12 +1981,12 @@ def as_column( column = Column.from_pylibcudf( plc.filling.sequence( len(arbitrary), - as_device_scalar( + cudf.Scalar( arbitrary.start, dtype=np.dtype(np.int64) - ).c_value, - as_device_scalar( + ).device_value.c_value, + cudf.Scalar( arbitrary.step, dtype=np.dtype(np.int64) - ).c_value, + ).device_value.c_value, ) ) if cudf.get_option("default_integer_bitwidth") and dtype is None: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 3d9440cdf21..6283e498842 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -236,7 +236,7 @@ def from_sequences( # Build Data, Mask & Offsets for data in arbitrary: - if cudf._lib.scalar._is_null_host_scalar(data): + if cudf.utils.utils._is_null_host_scalar(data): mask_col.append(False) offset_vals.append(offset) else: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 4405e153b0c..8fe5299fcdd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -151,7 +151,7 @@ def __setitem__(self, key: Any, value: Any): cudf.Scalar( value, dtype=self.dtype - if cudf._lib.scalar._is_null_host_scalar(value) + if cudf.utils.utils._is_null_host_scalar(value) else None, ) if is_scalar(value) @@ -789,7 +789,7 @@ def 
_normalize_find_and_replace_input( ) # Scalar case if len(col_to_normalize) == 1: - if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]): + if cudf.utils.utils._is_null_host_scalar(col_to_normalize[0]): return normalized_column.astype(input_column_dtype) if np.isinf(col_to_normalize[0]): return normalized_column diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 417fa99dac0..749ab8e837a 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,9 +1,10 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime import functools +import math from typing import TYPE_CHECKING, cast import numpy as np @@ -263,7 +264,15 @@ def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] def total_seconds(self) -> ColumnBase: - raise NotImplementedError("total_seconds is currently not implemented") + conversion = _unit_to_nanoseconds_conversion[self.time_unit] / 1e9 + # Typecast to decimal128 to avoid floating point precision issues + # https://github.com/rapidsai/cudf/issues/17664 + return ( + (self.astype("int64") * conversion) + .astype(cudf.Decimal128Dtype(38, 9)) + .round(decimals=abs(int(math.log10(conversion)))) + .astype("float64") + ) def ceil(self, freq: str) -> ColumnBase: raise NotImplementedError("ceil is currently not implemented") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3334b57ce1b..b2121511a14 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -92,7 +92,11 @@ min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api +from cudf.utils.utils import ( + GetAttrGetItemMixin, + _external_only_api, + _is_null_host_scalar, +) if TYPE_CHECKING: from cudf._typing import ColumnLike, Dtype, NotImplementedType @@ -3371,7 +3375,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if isinstance(value, (np.ndarray, cupy.ndarray)): dtype = value.dtype value = value.item() - if libcudf.scalar._is_null_host_scalar(value): + if _is_null_host_scalar(value): dtype = "str" value = as_column( value, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4137109cc96..6ae524d6346 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import copy @@ -49,7 +49,7 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: - from collections.abc import Generator, Iterable + from collections.abc import Generator, Hashable, Iterable from cudf._typing import ( AggType, @@ -2448,7 +2448,7 @@ def _cov_or_corr(self, func, method_name): # create expanded dataframe consisting all combinations of the # struct columns-pairs to be used in the correlation or covariance # i.e. 
(('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - column_names = self.grouping.values._column_names + column_names = self.grouping._values_column_names num_cols = len(column_names) column_pair_structs = {} @@ -2682,10 +2682,8 @@ def diff(self, periods=1, axis=0): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - - values = self.obj.__class__._from_data( - self.grouping.values._data, self.obj.index - ) + values = self.grouping.values + values.index = self.obj.index return values - self.shift(periods=periods) def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: @@ -2789,9 +2787,8 @@ def fillna( raise ValueError("Method can only be of 'ffill', 'bfill'.") return getattr(self, method, limit)() - values = self.obj.__class__._from_data( - self.grouping.values._data, self.obj.index - ) + values = self.grouping.values + values.index = self.obj.index return values.fillna( value=value, inplace=inplace, axis=axis, limit=limit ) @@ -3543,6 +3540,13 @@ def keys(self): self._key_columns[0], name=self.names[0] ) + @property + def _values_column_names(self) -> list[Hashable]: + # If the key columns are in `obj`, filter them out + return [ + x for x in self._obj._column_names if x not in self._named_columns + ] + @property def values(self) -> cudf.core.frame.Frame: """Return value columns as a frame. @@ -3553,11 +3557,9 @@ def values(self) -> cudf.core.frame.Frame: This is mainly used in transform-like operations. """ - # If the key columns are in `obj`, filter them out - value_column_names = [ - x for x in self._obj._column_names if x not in self._named_columns - ] - value_columns = self._obj._data.select_by_label(value_column_names) + value_columns = self._obj._data.select_by_label( + self._values_column_names + ) return self._obj.__class__._from_data(value_columns) def _handle_callable(self, by): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index eac5b9d71ae..85be8d21d27 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
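# [Editor's sketch, not part of the diff: this refers back to the
#  TimedeltaColumn.total_seconds() implementation added earlier in this patch.
#  The column is multiplied by the unit's length in seconds and rounded to that
#  unit's sub-second precision; the PR performs the rounding in Decimal128 to
#  sidestep float precision issues, and plain Python floats are used here only
#  to show the arithmetic for nanosecond data:]
import math

conversion = 1 / 1e9                         # seconds per tick for "ns" data
decimals = abs(int(math.log10(conversion)))  # 9 sub-second digits to keep
one_day_ns = 86_400_000_000_000
print(round(one_day_ns * conversion, decimals))  # 86400.0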
 from __future__ import annotations
@@ -842,14 +842,14 @@ def sort_values(
     @_performance_tracking
     def _gather(self, gather_map, nullify=False, check_bounds=True):
         gather_map = cudf.core.column.as_column(gather_map)
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._column.take(gather_map, nullify, check_bounds),
             name=self.name,
         )
     @_performance_tracking
     def _apply_boolean_mask(self, boolean_mask):
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._column.apply_boolean_mask(boolean_mask), name=self.name
         )
@@ -857,7 +857,7 @@ def repeat(self, repeats, axis=None):
         return self._as_int_index().repeat(repeats, axis)
     def _split(self, splits):
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._as_int_index()._split(splits), name=self.name
         )
@@ -1657,7 +1657,7 @@ def _clean_nulls_from_index(self) -> Index:
             if isinstance(self, (DatetimeIndex, TimedeltaIndex))
             else str(cudf.NA)
         )
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._column.astype("str").fillna(fill_value),
             name=self.name,
         )
@@ -2964,13 +2964,13 @@ def median(self, *, skipna: bool = True, axis: int | None = 0):
     def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1):
         return self._column.std(skipna=skipna, ddof=ddof)
-    def total_seconds(self) -> cupy.ndarray:
+    def total_seconds(self) -> Index:
         """
         Return total duration of each element expressed in seconds.
-        This method is currently not implemented.
+        The result is an Index of ``float64`` dtype.
         """
-        return self._column.total_seconds().values
+        return Index._from_column(self._column.total_seconds(), name=self.name)
     def ceil(self, freq: str) -> Self:
         """
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 6854cb02aa5..e9ed74f804b 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 """Base class for Frame types that have an index."""
 from __future__ import annotations
@@ -2836,16 +2836,22 @@ def hash_values(
         Parameters
         ----------
-        method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3'
+        method : {'murmur3', 'xxhash32', 'xxhash64', 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'}, default 'murmur3'
            Hash function to use:
            * murmur3: MurmurHash3 hash function
-            * md5: MD5 hash function
+            * xxhash32: xxHash32 hash function
             * xxhash64: xxHash64 hash function
+            * md5: MD5 hash function
+            * sha1: SHA-1 hash function
+            * sha224: SHA-224 hash function
+            * sha256: SHA-256 hash function
+            * sha384: SHA-384 hash function
+            * sha512: SHA-512 hash function
         seed : int, optional
             Seed value to use for the hash function. This parameter is only
-            supported for 'murmur3' and 'xxhash64'.
+            supported for 'murmur3', 'xxhash32', and 'xxhash64'.
Returns @@ -2900,7 +2906,7 @@ def hash_values( 2 fe061786ea286a515b772d91b0dfcd70 dtype: object """ - seed_hash_methods = {"murmur3", "xxhash64"} + seed_hash_methods = {"murmur3", "xxhash32", "xxhash64"} if seed is None: seed = 0 elif method not in seed_hash_methods: @@ -2914,6 +2920,8 @@ def hash_values( ) if method == "murmur3": plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed) + elif method == "xxhash32": + plc_column = plc.hashing.xxhash_32(plc_table, seed) elif method == "xxhash64": plc_column = plc.hashing.xxhash_64(plc_table, seed) elif method == "md5": diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 80dd0921f9c..7d246960cc9 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -178,13 +178,13 @@ def dtype(self): def is_valid(self): if not self._is_host_value_current: self._device_value_to_host() - return not cudf._lib.scalar._is_null_host_scalar(self._host_value) + return not cudf.utils.utils._is_null_host_scalar(self._host_value) def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): - valid = not cudf._lib.scalar._is_null_host_scalar(value) + valid = not cudf.utils.utils._is_null_host_scalar(value) if isinstance(value, list): if dtype is not None: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 961e5e11bc0..49c2c8cf387 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -5183,6 +5183,66 @@ def components(self) -> cudf.DataFrame: ca, index=self.series.index ) + def total_seconds(self) -> Series: + """ + Return total duration of each element expressed in seconds. + + This method is available directly on TimedeltaIndex + and on Series containing timedelta values under the ``.dt`` namespace. + + Returns + ------- + Index or Series + When the calling object is a TimedeltaIndex, + the return type is an Index with a float64 dtype. When the calling object + is a Series, the return type is Series of type `float64` whose + index is the same as the original. + + See Also + -------- + datetime.timedelta.total_seconds : Standard library version + of this method. + TimedeltaIndex.components : Return a DataFrame with components of + each Timedelta. 
+ + Examples + -------- + **Series** + + >>> import cudf + >>> import pandas as pd + >>> import numpy as np + >>> s = cudf.Series(pd.to_timedelta(np.arange(5), unit="D")) + >>> s + 0 0 days 00:00:00 + 1 1 days 00:00:00 + 2 2 days 00:00:00 + 3 3 days 00:00:00 + 4 4 days 00:00:00 + dtype: timedelta64[ns] + + >>> s.dt.total_seconds() + 0 0.0 + 1 86400.0 + 2 172800.0 + 3 259200.0 + 4 345600.0 + dtype: float64 + + **TimedeltaIndex** + + >>> idx = cudf.from_pandas(pd.to_timedelta(np.arange(5), unit="D")) + >>> idx + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq=None) + + >>> idx.total_seconds() + Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64') + """ + return self._return_result_like_self( + self.series._column.total_seconds() + ) + @_performance_tracking def _align_indices(series_list, how="outer", allow_non_unique=False): diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc new file mode 100644 index 00000000000..a0ea4fbbfc2 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc new file mode 100644 index 00000000000..8a7969cdbbb Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc differ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 11a9b398b50..f3cf8e36a5b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import array as arr import contextlib @@ -1440,6 +1440,7 @@ def test_assign_callable(mapping): "sha256", "sha384", "sha512", + "xxhash32", "xxhash64", ], ) @@ -1447,6 +1448,7 @@ def test_assign_callable(mapping): def test_dataframe_hash_values(nrows, method, seed): warning_expected = seed is not None and method not in { "murmur3", + "xxhash32", "xxhash64", } potential_warning = ( @@ -1472,6 +1474,7 @@ def test_dataframe_hash_values(nrows, method, seed): "sha256": object, "sha384": object, "sha512": object, + "xxhash32": np.uint32, "xxhash64": np.uint64, } assert out.dtype == expected_dtypes[method] @@ -1486,7 +1489,7 @@ def test_dataframe_hash_values(nrows, method, seed): assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) -@pytest.mark.parametrize("method", ["murmur3", "xxhash64"]) +@pytest.mark.parametrize("method", ["murmur3", "xxhash32", "xxhash64"]) def test_dataframe_hash_values_seed(method): gdf = cudf.DataFrame() data = np.arange(10) @@ -1500,6 +1503,34 @@ def test_dataframe_hash_values_seed(method): assert_neq(out_one, out_two) +def test_dataframe_hash_values_xxhash32(): + # xxhash32 has no built-in implementation in Python and we don't want to + # add a testing dependency, so we use regression tests against known good + # values. 
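# [Editor's sketch, not part of the test below: with this PR applied, the new
#  hasher is reached through the regular hash_values() API and, like "murmur3"
#  and "xxhash64", accepts a seed; every row hashes to a uint32 value.]
example = cudf.DataFrame({"a": [1, 2, 3]}).hash_values(method="xxhash32", seed=42)
assert example.dtype == np.uint32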
+ gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) + gdf["b"] = -gdf["a"] + out_a = gdf["a"].hash_values(method="xxhash32", seed=0) + expected_a = cudf.Series( + [3736311059, 2307980487, 2906647130, 746578903, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_a, expected_a) + + out_b = gdf["b"].hash_values(method="xxhash32", seed=42) + expected_b = cudf.Series( + [1076387279, 2261349915, 531498073, 650869264, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_b, expected_b) + + out_df = gdf.hash_values(method="xxhash32", seed=0) + expected_df = cudf.Series( + [1223721700, 2885793241, 1920811472, 1146715602, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_df, expected_df) + + def test_dataframe_hash_values_xxhash64(): # xxhash64 has no built-in implementation in Python and we don't want to # add a testing dependency, so we use regression tests against known good diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index c4b4ef60184..fe143e66407 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import datetime import decimal @@ -1970,3 +1970,25 @@ def test_row_group_alignment(datadir): got = cudf.read_orc(buffer) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "inputfile", + [ + "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc", + "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc", + ], +) +def test_orc_reader_desynced_timestamp(datadir, inputfile): + # Test a special case where the DATA stream (second) in a TIMESTAMP column + # is progressed faster than the SECONDARY stream (nanosecond) at the start of a row + # group. In this case, the "run cache manager" in the decoder kernel is used to + # orchestrate the dual-stream processing. + # For more information, see https://github.com/rapidsai/cudf/issues/17155. + + path = datadir / inputfile + + expect = pd.read_orc(path) + got = cudf.read_orc(path) + + assert_frame_equal(cudf.from_pandas(expect), got) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index d622ff6b94e..f1da2a060ec 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import datetime import operator @@ -1506,3 +1506,25 @@ def test_tdi_unit(): result = pd_tdi.unit expected = cudf_tdi.unit assert result == expected + + +@pytest.mark.parametrize("data", _TIMEDELTA_DATA) +@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) +def test_timedelta_series_total_seconds(data, dtype): + gsr = cudf.Series(data, dtype=dtype) + psr = gsr.to_pandas() + + expected = psr.dt.total_seconds() + actual = gsr.dt.total_seconds() + assert_eq(expected, actual) + + +@pytest.mark.parametrize("data", _TIMEDELTA_DATA) +@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) +def test_timedelta_index_total_seconds(request, data, dtype): + gi = cudf.Index(data, dtype=dtype) + pi = gi.to_pandas() + + expected = pi.total_seconds() + actual = gi.total_seconds() + assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index ca8f9cac2d0..31a8f4de3b3 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -198,7 +198,7 @@ def to_cudf_compatible_scalar(val, dtype=None): If `val` is None, returns None. 
""" - if cudf._lib.scalar._is_null_host_scalar(val) or isinstance( + if cudf.utils.utils._is_null_host_scalar(val) or isinstance( val, cudf.Scalar ): return val diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c83c1cbe895..0adaaa60654 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -341,6 +341,15 @@ def is_na_like(obj): return obj is None or obj is cudf.NA or obj is cudf.NaT +def _is_null_host_scalar(slr) -> bool: + # slr is NA like or NaT like + return ( + is_na_like(slr) + or (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) + or slr is pd.NaT + ) + + def _warn_no_dask_cudf(fn): @functools.wraps(fn) def wrapper(self): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index b88b109a975..92f39abe71e 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -91,7 +91,7 @@ def __init__( op = partial(self._reduce, request=req) elif name in {"min", "max"}: op = partial(op, propagate_nans=options) - elif name in {"count", "first", "last"}: + elif name in {"count", "sum", "first", "last"}: pass else: raise NotImplementedError( @@ -180,6 +180,18 @@ def _count(self, column: Column) -> Column: ) ) + def _sum(self, column: Column) -> Column: + if column.obj.size() == 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(0, type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + return self._reduce(column, request=plc.aggregation.sum()) + def _min(self, column: Column, *, propagate_nans: bool) -> Column: if propagate_nans and column.nan_count > 0: return Column( diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 86cb2352dcc..15ad845ea78 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -148,3 +148,9 @@ def test_agg_singleton(op): q = df.select(op(pl.col("a"))) assert_gpu_result_equal(q) + + +def test_sum_empty_zero(): + df = pl.LazyFrame({"a": pl.Series(values=[], dtype=pl.Int32())}) + q = df.select(pl.col("a").sum()) + assert_gpu_result_equal(q) diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 20eb2404b77..863102103ed 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -1,7 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import warnings -from importlib import import_module +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import dask.dataframe as dd from dask import config @@ -9,11 +6,16 @@ import cudf -from . import backends # noqa: F401 +from . import backends, io # noqa: F401 +from ._expr.expr import _patch_dask_expr from ._version import __git_commit__, __version__ # noqa: F401 -from .core import DataFrame, Index, Series, concat, from_cudf +from .core import DataFrame, Index, Series, _deprecated_api, concat, from_cudf -QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED +if not (QUERY_PLANNING_ON := dd._dask_expr_enabled()): + raise ValueError( + "The legacy DataFrame API is not supported in dask_cudf>24.12. 
" + "Please enable query-planning, or downgrade to dask_cudf<=24.12" + ) def read_csv(*args, **kwargs): @@ -36,46 +38,18 @@ def read_parquet(*args, **kwargs): return dd.read_parquet(*args, **kwargs) -def _deprecated_api(old_api, new_api=None, rec=None): - def inner_func(*args, **kwargs): - if new_api: - # Use alternative - msg = f"{old_api} is now deprecated. " - msg += rec or f"Please use {new_api} instead." - warnings.warn(msg, FutureWarning) - new_attr = new_api.split(".") - module = import_module(".".join(new_attr[:-1])) - return getattr(module, new_attr[-1])(*args, **kwargs) - - # No alternative - raise an error - raise NotImplementedError( - f"{old_api} is no longer supported. " + (rec or "") - ) - - return inner_func - - -if QUERY_PLANNING_ON: - from . import io - from ._expr.expr import _patch_dask_expr - - groupby_agg = _deprecated_api("dask_cudf.groupby_agg") - read_text = DataFrame.read_text - _patch_dask_expr() - -else: - from . import io # noqa: F401 - from ._legacy.groupby import groupby_agg # noqa: F401 - from ._legacy.io import read_text # noqa: F401 - - +groupby_agg = _deprecated_api("dask_cudf.groupby_agg") +read_text = DataFrame.read_text to_orc = _deprecated_api( "dask_cudf.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.to_orc", rec="Please use DataFrame.to_orc instead.", ) +_patch_dask_expr() + + __all__ = [ "DataFrame", "Index", diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 5192e6b8171..e8c9a970b7b 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import warnings from functools import cached_property @@ -15,19 +15,11 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.dataframe.dispatch import get_parallel_type from dask.typing import no_default import cudf -_LEGACY_WORKAROUND = ( - "To enable the 'legacy' dask-cudf API, set the " - "global 'dataframe.query-planning' config to " - "`False` before dask is imported. This can also " - "be done by setting an environment variable: " - "`DASK_DATAFRAME__QUERY_PLANNING=False` " -) - - ## ## Custom collection classes ## @@ -103,9 +95,8 @@ def set_index( divisions = None warnings.warn( "Ignoring divisions='quantile'. This option is now " - "deprecated. Please use the legacy API and raise an " - "issue on github if this feature is necessary." - f"\n{_LEGACY_WORKAROUND}", + "deprecated. Please raise an issue on github if this " + "feature is necessary.", FutureWarning, ) @@ -135,9 +126,7 @@ def groupby( if kwargs.pop("as_index") is not True: raise NotImplementedError( - f"{msg} Please reset the index after aggregating, or " - "use the legacy API if `as_index=False` is required.\n" - f"{_LEGACY_WORKAROUND}" + f"{msg} Please reset the index after aggregating." 
) else: warnings.warn(msg, FutureWarning) @@ -153,15 +142,15 @@ def groupby( ) def to_orc(self, *args, **kwargs): - from dask_cudf._legacy.io import to_orc + from dask_cudf.io.orc import to_orc as to_orc_impl - return to_orc(self, *args, **kwargs) + return to_orc_impl(self, *args, **kwargs) @staticmethod def read_text(*args, **kwargs): - from dask_cudf._legacy.io.text import read_text as legacy_read_text + from dask_cudf.io.text import read_text as read_text_impl - return legacy_read_text(*args, **kwargs) + return read_text_impl(*args, **kwargs) def clip(self, lower=None, upper=None, axis=1): if axis not in (None, 1): @@ -197,6 +186,13 @@ class Index(DXIndex, CudfFrameBase): pass # Same as pandas (for now) +# dask.dataframe dispatch +get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) +get_parallel_type.register(cudf.Series, lambda _: Series) +get_parallel_type.register(cudf.BaseIndex, lambda _: Index) + + +# dask_expr dispatch (might go away?) get_collection_type.register(cudf.DataFrame, lambda _: DataFrame) get_collection_type.register(cudf.Series, lambda _: Series) get_collection_type.register(cudf.BaseIndex, lambda _: Index) diff --git a/python/dask_cudf/dask_cudf/_expr/expr.py b/python/dask_cudf/dask_cudf/_expr/expr.py index 8b91e53604c..03d1da0d258 100644 --- a/python/dask_cudf/dask_cudf/_expr/expr.py +++ b/python/dask_cudf/dask_cudf/_expr/expr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import functools import dask_expr._shuffle as _shuffle_module @@ -7,13 +7,13 @@ from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var -from dask.dataframe.core import ( - is_dataframe_like, +from dask.dataframe.dispatch import ( + is_categorical_dtype, make_meta, meta_nonempty, ) -from dask.dataframe.dispatch import is_categorical_dtype from dask.typing import no_default +from dask.utils import is_dataframe_like import cudf diff --git a/python/dask_cudf/dask_cudf/_expr/groupby.py b/python/dask_cudf/dask_cudf/_expr/groupby.py index 0242fac6e72..a5cdd43169b 100644 --- a/python/dask_cudf/dask_cudf/_expr/groupby.py +++ b/python/dask_cudf/dask_cudf/_expr/groupby.py @@ -1,6 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import functools +import numpy as np import pandas as pd from dask_expr._collection import new_collection from dask_expr._groupby import ( @@ -16,11 +17,262 @@ from dask.dataframe.groupby import Aggregation from cudf.core.groupby.groupby import _deprecate_collect +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking ## ## Fused groupby aggregations ## +OPTIMIZED_AGGS = ( + "count", + "mean", + "std", + "var", + "sum", + "min", + "max", + list, + "first", + "last", +) + + +def _make_name(col_name, sep="_"): + """Combine elements of `col_name` into a single string, or no-op if + `col_name` is already a string + """ + if isinstance(col_name, str): + return col_name + return sep.join(name for name in col_name if name != "") + + +@_dask_cudf_performance_tracking +def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): + """Initial partition-level aggregation task. + + This is the first operation to be executed on each input + partition in `groupby_agg`. Depending on `aggs`, four possible + groupby aggregations ("count", "sum", "min", and "max") are + performed. The result is then partitioned (by hashing `gb_cols`) + into a number of distinct dictionary elements. 
The number of + elements in the output dictionary (`split_out`) corresponds to + the number of partitions in the final output of `groupby_agg`. + """ + + # Modify dict for initial (partition-wise) aggregations + _agg_dict = {} + for col, agg_list in aggs.items(): + _agg_dict[col] = set() + for agg in agg_list: + if agg in ("mean", "std", "var"): + _agg_dict[col].add("count") + _agg_dict[col].add("sum") + else: + _agg_dict[col].add(agg) + _agg_dict[col] = list(_agg_dict[col]) + if set(agg_list).intersection({"std", "var"}): + pow2_name = _make_name((col, "pow2"), sep=sep) + df[pow2_name] = df[col].astype("float64").pow(2) + _agg_dict[pow2_name] = ["sum"] + + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + _agg_dict + ) + output_columns = [_make_name(name, sep=sep) for name in gb.columns] + gb.columns = output_columns + # Return with deterministic column ordering + return gb[sorted(output_columns)] + + +@_dask_cudf_performance_tracking +def _tree_node_agg(df, gb_cols, dropna, sort, sep): + """Node in groupby-aggregation reduction tree. + + The input DataFrame (`df`) corresponds to the + concatenated output of one or more `_groupby_partition_agg` + tasks. In this function, "sum", "min" and/or "max" groupby + aggregations will be used to combine the statistics for + duplicate keys. + """ + + agg_dict = {} + for col in df.columns: + if col in gb_cols: + continue + agg = col.split(sep)[-1] + if agg in ("count", "sum"): + agg_dict[col] = ["sum"] + elif agg == "list": + agg_dict[col] = [list] + elif agg in OPTIMIZED_AGGS: + agg_dict[col] = [agg] + else: + raise ValueError(f"Unexpected aggregation: {agg}") + + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + agg_dict + ) + + # Don't include the last aggregation in the column names + output_columns = [ + _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) + for name in gb.columns + ] + gb.columns = output_columns + # Return with deterministic column ordering + return gb[sorted(output_columns)] + + +@_dask_cudf_performance_tracking +def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): + """Calculate variance (given count, sum, and sum-squared columns).""" + + # Select count, sum, and sum-squared + n = df[count_name] + x = df[sum_name] + x2 = df[pow2_sum_name] + + # Use sum-squared approach to get variance + var = x2 - x**2 / n + div = n - ddof + div[div < 1] = 1 # Avoid division by 0 + var /= div + + # Set appropriate NaN elements + # (since we avoided 0-division) + var[(n - ddof) == 0] = np.nan + + return var + + +@_dask_cudf_performance_tracking +def _finalize_gb_agg( + gb_in, + gb_cols, + aggs, + columns, + final_columns, + as_index, + dropna, + sort, + sep, + str_cols_out, + aggs_renames, +): + """Final aggregation task. + + This is the final operation on each output partitions + of the `groupby_agg` algorithm. This function must + take care of higher-order aggregations, like "mean", + "std" and "var". We also need to deal with the column + index, the row index, and final sorting behavior. 
+ """ + + gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) + + # Deal with higher-order aggregations + for col in columns: + agg_list = aggs.get(col, []) + agg_set = set(agg_list) + if agg_set.intersection({"mean", "std", "var"}): + count_name = _make_name((col, "count"), sep=sep) + sum_name = _make_name((col, "sum"), sep=sep) + if agg_set.intersection({"std", "var"}): + pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) + var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) + if "var" in agg_list: + name_var = _make_name((col, "var"), sep=sep) + gb[name_var] = var + if "std" in agg_list: + name_std = _make_name((col, "std"), sep=sep) + gb[name_std] = np.sqrt(var) + gb.drop(columns=[pow2_sum_name], inplace=True) + if "mean" in agg_list: + mean_name = _make_name((col, "mean"), sep=sep) + gb[mean_name] = gb[sum_name] / gb[count_name] + if "sum" not in agg_list: + gb.drop(columns=[sum_name], inplace=True) + if "count" not in agg_list: + gb.drop(columns=[count_name], inplace=True) + if list in agg_list: + collect_name = _make_name((col, "list"), sep=sep) + gb[collect_name] = gb[collect_name].list.concat() + + # Ensure sorted keys if `sort=True` + if sort: + gb = gb.sort_values(gb_cols) + + # Set index if necessary + if as_index: + gb.set_index(gb_cols, inplace=True) + + # Unflatten column names + col_array = [] + agg_array = [] + for col in gb.columns: + if col in gb_cols: + col_array.append(col) + agg_array.append("") + else: + name, agg = col.split(sep) + col_array.append(name) + agg_array.append(aggs_renames.get((name, agg), agg)) + if str_cols_out: + gb.columns = col_array + else: + gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) + + return gb[final_columns] + + +@_dask_cudf_performance_tracking +def _redirect_aggs(arg): + """Redirect aggregations to their corresponding name in cuDF""" + redirects = { + sum: "sum", + max: "max", + min: "min", + "collect": list, + "list": list, + } + if isinstance(arg, dict): + new_arg = dict() + for col in arg: + if isinstance(arg[col], list): + new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] + elif isinstance(arg[col], dict): + new_arg[col] = { + k: redirects.get(v, v) for k, v in arg[col].items() + } + else: + new_arg[col] = redirects.get(arg[col], arg[col]) + return new_arg + if isinstance(arg, list): + return [redirects.get(agg, agg) for agg in arg] + return redirects.get(arg, arg) + + +@_dask_cudf_performance_tracking +def _aggs_optimized(arg, supported: set): + """Check that aggregations in `arg` are a subset of `supported`""" + if isinstance(arg, (list, dict)): + if isinstance(arg, dict): + _global_set: set[str] = set() + for col in arg: + if isinstance(arg[col], list): + _global_set = _global_set.union(set(arg[col])) + elif isinstance(arg[col], dict): + _global_set = _global_set.union(set(arg[col].values())) + else: + _global_set.add(arg[col]) + else: + _global_set = set(arg) + + return bool(_global_set.issubset(supported)) + elif isinstance(arg, (str, type)): + return arg in supported + return False + def _get_spec_info(gb): if isinstance(gb.arg, (dict, list)): @@ -105,20 +357,14 @@ def shuffle_by_index(self): @classmethod def chunk(cls, df, *by, **kwargs): - from dask_cudf._legacy.groupby import _groupby_partition_agg - return _groupby_partition_agg(df, **kwargs) @classmethod def combine(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby import _tree_node_agg - return _tree_node_agg(_concat(inputs), **kwargs) @classmethod def aggregate(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby 
import _finalize_gb_agg - return _finalize_gb_agg(_concat(inputs), **kwargs) @property @@ -193,12 +439,6 @@ def _maybe_get_custom_expr( shuffle_method=None, **kwargs, ): - from dask_cudf._legacy.groupby import ( - OPTIMIZED_AGGS, - _aggs_optimized, - _redirect_aggs, - ) - if kwargs: # Unsupported key-word arguments return None diff --git a/python/dask_cudf/dask_cudf/_legacy/core.py b/python/dask_cudf/dask_cudf/_legacy/core.py deleted file mode 100644 index d6beb775a5e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/core.py +++ /dev/null @@ -1,711 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import math -import warnings - -import numpy as np -import pandas as pd -from tlz import partition_all - -from dask import dataframe as dd -from dask.base import normalize_token, tokenize -from dask.dataframe.core import ( - Scalar, - handle_out, - make_meta as dask_make_meta, - map_partitions, -) -from dask.dataframe.utils import raise_on_meta_error -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname - -import cudf -from cudf import _lib as libcudf -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._expr.accessors import ListMethods, StructMethods -from dask_cudf._legacy import sorting -from dask_cudf._legacy.sorting import ( - _deprecate_shuffle_kwarg, - _get_shuffle_method, -) - - -class _Frame(dd.core._Frame, OperatorMethodMixin): - """Superclass for DataFrame and Series - - Parameters - ---------- - dsk : dict - The dask graph to compute this DataFrame - name : str - The key prefix that specifies which keys in the dask comprise this - particular DataFrame / Series - meta : cudf.DataFrame, cudf.Series, or cudf.Index - An empty cudf object with names, dtypes, and indices matching the - expected output. - divisions : tuple of index values - Values along which we partition our blocks on the index - """ - - def _is_partition_type(self, meta): - return isinstance(meta, self._partition_type) - - def __repr__(self): - s = "" - return s % (type(self).__name__, len(self.dask), self.npartitions) - - -normalize_token.register(_Frame, lambda a: a._name) - - -class DataFrame(_Frame, dd.core.DataFrame): - """ - A distributed Dask DataFrame where the backing dataframe is a - :class:`cuDF DataFrame `. - - Typically you would not construct this object directly, but rather - use one of Dask-cuDF's IO routines. - - Most operations on :doc:`Dask DataFrames ` are - supported, with many of the same caveats. 
- - """ - - _partition_type = cudf.DataFrame - - @_dask_cudf_performance_tracking - def _assign_column(self, k, v): - def assigner(df, k, v): - out = df.copy() - out[k] = v - return out - - meta = assigner(self._meta, k, dask_make_meta(v)) - return self.map_partitions(assigner, k, v, meta=meta) - - @_dask_cudf_performance_tracking - def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): - import uuid - - if kwargs is None: - kwargs = {} - - if cache_key is None: - cache_key = uuid.uuid4() - - def do_apply_rows(df, func, incols, outcols, kwargs): - return df.apply_rows( - func, incols, outcols, kwargs, cache_key=cache_key - ) - - meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) - return self.map_partitions( - do_apply_rows, func, incols, outcols, kwargs, meta=meta - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def merge(self, other, shuffle_method=None, **kwargs): - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().merge( - other, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def join(self, other, shuffle_method=None, **kwargs): - # CuDF doesn't support "right" join yet - how = kwargs.pop("how", "left") - if how == "right": - return other.join(other=self, how="left", **kwargs) - - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().join( - other, - how=how, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def set_index( - self, - other, - sorted=False, - divisions=None, - shuffle_method=None, - **kwargs, - ): - pre_sorted = sorted - del sorted - - if divisions == "quantile": - warnings.warn( - "Using divisions='quantile' is now deprecated. 
" - "Please raise an issue on github if you believe " - "this feature is necessary.", - FutureWarning, - ) - - if ( - divisions == "quantile" - or isinstance(divisions, (cudf.DataFrame, cudf.Series)) - or ( - isinstance(other, str) - and cudf.api.types.is_string_dtype(self[other].dtype) - ) - ): - # Let upstream-dask handle "pre-sorted" case - if pre_sorted: - return dd.shuffle.set_sorted_index( - self, other, divisions=divisions, **kwargs - ) - - by = other - if not isinstance(other, list): - by = [by] - if len(by) > 1: - raise ValueError("Dask does not support MultiIndex (yet).") - if divisions == "quantile": - divisions = None - - # Use dask_cudf's sort_values - df = self.sort_values( - by, - max_branch=kwargs.get("max_branch", None), - divisions=divisions, - set_divisions=True, - ignore_index=True, - shuffle_method=shuffle_method, - ) - - # Ignore divisions if its a dataframe - if isinstance(divisions, cudf.DataFrame): - divisions = None - - # Set index and repartition - df2 = df.map_partitions( - sorting.set_index_post, - index_name=other, - drop=kwargs.get("drop", True), - column_dtype=df.columns.dtype, - ) - npartitions = kwargs.get("npartitions", self.npartitions) - partition_size = kwargs.get("partition_size", None) - if partition_size: - return df2.repartition(partition_size=partition_size) - if not divisions and df2.npartitions != npartitions: - return df2.repartition(npartitions=npartitions) - if divisions and df2.npartitions != len(divisions) - 1: - return df2.repartition(divisions=divisions) - return df2 - - return super().set_index( - other, - sorted=pre_sorted, - shuffle_method=_get_shuffle_method(shuffle_method), - divisions=divisions, - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def sort_values( - self, - by, - ignore_index=False, - max_branch=None, - divisions=None, - set_divisions=False, - ascending=True, - na_position="last", - sort_function=None, - sort_function_kwargs=None, - shuffle_method=None, - **kwargs, - ): - if kwargs: - raise ValueError( - f"Unsupported input arguments passed : {list(kwargs.keys())}" - ) - - df = sorting.sort_values( - self, - by, - max_branch=max_branch, - divisions=divisions, - set_divisions=set_divisions, - ignore_index=ignore_index, - ascending=ascending, - na_position=na_position, - shuffle_method=shuffle_method, - sort_function=sort_function, - sort_function_kwargs=sort_function_kwargs, - ) - - if ignore_index: - return df.reset_index(drop=True) - return df - - @_dask_cudf_performance_tracking - def to_parquet(self, path, *args, **kwargs): - """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" - from dask_cudf._legacy.io import to_parquet - - return to_parquet(self, path, *args, **kwargs) - - @_dask_cudf_performance_tracking - def to_orc(self, path, **kwargs): - """Calls dask_cudf._legacy.io.to_orc""" - from dask_cudf._legacy.io import to_orc - - return to_orc(self, path, **kwargs) - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - numeric_only=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, 
split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def shuffle(self, *args, shuffle_method=None, **kwargs): - """Wraps dask.dataframe DataFrame.shuffle method""" - return super().shuffle( - *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs - ) - - @_dask_cudf_performance_tracking - def groupby(self, by=None, **kwargs): - from .groupby import CudfDataFrameGroupBy - - return CudfDataFrameGroupBy(self, by=by, **kwargs) - - -@_dask_cudf_performance_tracking -def sum_of_squares(x): - x = x.astype("f8")._column - outcol = libcudf.reduce.reduce("sum_of_squares", x) - return cudf.Series._from_column(outcol) - - -@_dask_cudf_performance_tracking -def var_aggregate(x2, x, n, ddof): - try: - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - result = (x2 / n) - (x / n) ** 2 - if ddof != 0: - result = result * n / (n - ddof) - return result - except ZeroDivisionError: - return np.float64(np.nan) - - -@_dask_cudf_performance_tracking -def nlargest_agg(x, **kwargs): - return cudf.concat(x).nlargest(**kwargs) - - -@_dask_cudf_performance_tracking -def nsmallest_agg(x, **kwargs): - return cudf.concat(x).nsmallest(**kwargs) - - -class Series(_Frame, dd.core.Series): - _partition_type = cudf.Series - - @_dask_cudf_performance_tracking - def count(self, split_every=False): - return reduction( - [self], - chunk=M.count, - aggregate=np.sum, - split_every=split_every, - meta="i8", - ) - - @_dask_cudf_performance_tracking - def mean(self, split_every=False): - sum = self.sum(split_every=split_every) - n = self.count(split_every=split_every) - return sum / n - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var(axis=axis, skipna=skipna) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_dask_cudf_performance_tracking - def groupby(self, *args, **kwargs): - from .groupby import CudfSeriesGroupBy - - return CudfSeriesGroupBy(self, *args, **kwargs) - - @property # type: ignore - @_dask_cudf_performance_tracking - def list(self): - return ListMethods(self) - - @property # type: ignore - @_dask_cudf_performance_tracking - def struct(self): - return StructMethods(self) - - -class Index(Series, dd.core.Index): - _partition_type = cudf.Index # type: ignore - - -@_dask_cudf_performance_tracking -def _naive_var(ddf, meta, skipna, ddof, split_every, out): - num = ddf._get_numeric_data() - x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) - n = num.count(split_every=split_every) - name = ddf._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof - ) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _parallel_var(ddf, meta, skipna, split_every, out): - def _local_var(x, skipna): - if skipna: - n = x.count() - avg = x.mean(skipna=skipna) - else: - # Not 
skipping nulls, so might as well - # avoid the full `count` operation - n = len(x) - avg = x.sum(skipna=skipna) / n - m2 = ((x - avg) ** 2).sum(skipna=skipna) - return n, avg, m2 - - def _aggregate_var(parts): - n, avg, m2 = parts[0] - for i in range(1, len(parts)): - n_a, avg_a, m2_a = n, avg, m2 - n_b, avg_b, m2_b = parts[i] - n = n_a + n_b - avg = (n_a * avg_a + n_b * avg_b) / n - delta = avg_b - avg_a - m2 = m2_a + m2_b + delta**2 * n_a * n_b / n - return n, avg, m2 - - def _finalize_var(vals): - n, _, m2 = vals - return m2 / (n - 1) - - # Build graph - nparts = ddf.npartitions - if not split_every: - split_every = nparts - name = "var-" + tokenize(skipna, split_every, out) - local_name = "local-" + name - num = ddf._get_numeric_data() - dsk = { - (local_name, n, 0): (_local_var, (num._name, n), skipna) - for n in range(nparts) - } - - # Use reduction tree - widths = [nparts] - while nparts > 1: - nparts = math.ceil(nparts / split_every) - widths.append(nparts) - height = len(widths) - for depth in range(1, height): - for group in range(widths[depth]): - p_max = widths[depth - 1] - lstart = split_every * group - lstop = min(lstart + split_every, p_max) - node_list = [ - (local_name, p, depth - 1) for p in range(lstart, lstop) - ] - dsk[(local_name, group, depth)] = (_aggregate_var, node_list) - if height == 1: - group = depth = 0 - dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) - - graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) - result = dd.core.new_dd_object(graph, name, meta, (None, None)) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _extract_meta(x): - """ - Extract internal cache data (``_meta``) from dask_cudf objects - """ - if isinstance(x, (Scalar, _Frame)): - return x._meta - elif isinstance(x, list): - return [_extract_meta(_x) for _x in x] - elif isinstance(x, tuple): - return tuple(_extract_meta(_x) for _x in x) - elif isinstance(x, dict): - return {k: _extract_meta(v) for k, v in x.items()} - return x - - -@_dask_cudf_performance_tracking -def _emulate(func, *args, **kwargs): - """ - Apply a function using args / kwargs. If arguments contain dd.DataFrame / - dd.Series, using internal cache (``_meta``) for calculation - """ - with raise_on_meta_error(funcname(func)): - return func(*_extract_meta(args), **_extract_meta(kwargs)) - - -@_dask_cudf_performance_tracking -def align_partitions(args): - """Align partitions between dask_cudf objects. - - Note that if all divisions are unknown, but have equal npartitions, then - they will be passed through unchanged. - """ - dfs = [df for df in args if isinstance(df, _Frame)] - if not dfs: - return args - - divisions = dfs[0].divisions - if not all(df.divisions == divisions for df in dfs): - raise NotImplementedError("Aligning mismatched partitions") - return args - - -@_dask_cudf_performance_tracking -def reduction( - args, - chunk=None, - aggregate=None, - combine=None, - meta=None, - token=None, - chunk_kwargs=None, - aggregate_kwargs=None, - combine_kwargs=None, - split_every=None, - **kwargs, -): - """Generic tree reduction operation. - - Parameters - ---------- - args : - Positional arguments for the `chunk` function. All `dask.dataframe` - objects should be partitioned and indexed equivalently. 
- chunk : function [block-per-arg] -> block - Function to operate on each block of data - aggregate : function list-of-blocks -> block - Function to operate on the list of results of chunk - combine : function list-of-blocks -> block, optional - Function to operate on intermediate lists of results of chunk - in a tree-reduction. If not provided, defaults to aggregate. - $META - token : str, optional - The name to use for the output keys. - chunk_kwargs : dict, optional - Keywords for the chunk function only. - aggregate_kwargs : dict, optional - Keywords for the aggregate function only. - combine_kwargs : dict, optional - Keywords for the combine function only. - split_every : int, optional - Group partitions into groups of this size while performing a - tree-reduction. If set to False, no tree-reduction will be used, - and all intermediates will be concatenated and passed to ``aggregate``. - Default is 8. - kwargs : - All remaining keywords will be passed to ``chunk``, ``aggregate``, and - ``combine``. - """ - if chunk_kwargs is None: - chunk_kwargs = dict() - if aggregate_kwargs is None: - aggregate_kwargs = dict() - chunk_kwargs.update(kwargs) - aggregate_kwargs.update(kwargs) - - if combine is None: - if combine_kwargs: - raise ValueError("`combine_kwargs` provided with no `combine`") - combine = aggregate - combine_kwargs = aggregate_kwargs - else: - if combine_kwargs is None: - combine_kwargs = dict() - combine_kwargs.update(kwargs) - - if not isinstance(args, (tuple, list)): - args = [args] - - npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)} - if len(npartitions) > 1: - raise ValueError("All arguments must have same number of partitions") - npartitions = npartitions.pop() - - if split_every is None: - split_every = 8 - elif split_every is False: - split_every = npartitions - elif split_every < 2 or not isinstance(split_every, int): - raise ValueError("split_every must be an integer >= 2") - - token_key = tokenize( - token or (chunk, aggregate), - meta, - args, - chunk_kwargs, - aggregate_kwargs, - combine_kwargs, - split_every, - ) - - # Chunk - a = f"{token or funcname(chunk)}-chunk-{token_key}" - if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: - dsk = { - (a, 0, i): (chunk, key) - for i, key in enumerate(args[0].__dask_keys__()) - } - else: - dsk = { - (a, 0, i): ( - apply, - chunk, - [(x._name, i) if isinstance(x, _Frame) else x for x in args], - chunk_kwargs, - ) - for i in range(args[0].npartitions) - } - - # Combine - b = f"{token or funcname(combine)}-combine-{token_key}" - k = npartitions - depth = 0 - while k > split_every: - for part_i, inds in enumerate(partition_all(split_every, range(k))): - conc = (list, [(a, depth, i) for i in inds]) - dsk[(b, depth + 1, part_i)] = ( - (apply, combine, [conc], combine_kwargs) - if combine_kwargs - else (combine, conc) - ) - k = part_i + 1 - a = b - depth += 1 - - # Aggregate - b = f"{token or funcname(aggregate)}-agg-{token_key}" - conc = (list, [(a, depth, i) for i in range(k)]) - if aggregate_kwargs: - dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) - else: - dsk[(b, 0)] = (aggregate, conc) - - if meta is None: - meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) - meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) - meta = dask_make_meta(meta) - - graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) - return dd.core.new_dd_object(graph, b, meta, (None, None)) - - -for name in ( - "add", - "sub", - "mul", - "truediv", - "floordiv", - 
"mod", - "pow", - "radd", - "rsub", - "rmul", - "rtruediv", - "rfloordiv", - "rmod", - "rpow", -): - meth = getattr(cudf.DataFrame, name) - DataFrame._bind_operator_method(name, meth, original=cudf.Series) - - meth = getattr(cudf.Series, name) - Series._bind_operator_method(name, meth, original=cudf.Series) - -for name in ("lt", "gt", "le", "ge", "ne", "eq"): - meth = getattr(cudf.Series, name) - Series._bind_comparison_method(name, meth, original=cudf.Series) diff --git a/python/dask_cudf/dask_cudf/_legacy/groupby.py b/python/dask_cudf/dask_cudf/_legacy/groupby.py deleted file mode 100644 index 7e01e91476d..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/groupby.py +++ /dev/null @@ -1,909 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -from functools import wraps - -import numpy as np -import pandas as pd - -from dask.dataframe.core import ( - DataFrame as DaskDataFrame, - aca, - split_out_on_cols, -) -from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy -from dask.utils import funcname - -import cudf -from cudf.core.groupby.groupby import _deprecate_collect -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._legacy.sorting import _deprecate_shuffle_kwarg - -# aggregations that are dask-cudf optimized -OPTIMIZED_AGGS = ( - "count", - "mean", - "std", - "var", - "sum", - "min", - "max", - list, - "first", - "last", -) - - -def _check_groupby_optimized(func): - """ - Decorator for dask-cudf's groupby methods that returns the dask-cudf - optimized method if the groupby object is supported, otherwise - reverting to the upstream Dask method - """ - - @wraps(func) - def wrapper(*args, **kwargs): - gb = args[0] - if _groupby_optimized(gb): - return func(*args, **kwargs) - # note that we use upstream Dask's default kwargs for this call if - # none are specified; this shouldn't be an issue as those defaults are - # consistent with dask-cudf - return getattr(super(type(gb), gb), func.__name__)(*args[1:], **kwargs) - - return wrapper - - -class CudfDataFrameGroupBy(DataFrameGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - def __getitem__(self, key): - if isinstance(key, list): - g = CudfDataFrameGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - else: - g = CudfSeriesGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - - g._meta = g._meta[key] - return g - - @_dask_cudf_performance_tracking - def _make_groupby_method_aggs(self, agg_name): - """Create aggs dictionary for aggregation methods""" - - if isinstance(self.by, list): - return {c: agg_name for c in self.obj.columns if c not in self.by} - return {c: agg_name for c in self.obj.columns if c != self.by} - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("count"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("mean"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, 
split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("std"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("var"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("sum"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("min"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("max"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs(list), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("first"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("last"), - split_every, - split_out, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - if isinstance(self._meta.grouping.keys, cudf.MultiIndex): - keys = self._meta.grouping.keys.names - else: - keys = self._meta.grouping.keys.name - - return groupby_agg( - self.obj, - keys, - arg, - split_every=split_every, - split_out=split_out, - sep=self.sep, - sort=self.sort, - as_index=self.as_index, - shuffle_method=shuffle_method, - **self.dropna, - ) - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -class CudfSeriesGroupBy(SeriesGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "count"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "mean"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "std"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, 
split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "var"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "sum"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "min"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "max"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - {self._slice: list}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "first"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "last"}, - split_every, - split_out, - )[self._slice] - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if not isinstance(arg, dict): - arg = {self._slice: arg} - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - return _make_groupby_agg_call( - self, arg, split_every, split_out, shuffle_method - )[self._slice] - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -def _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token=None, - sort=None, - shuffle_method=None, -): - # Shuffle-based groupby aggregation - # NOTE: This function is the dask_cudf version of - # dask.dataframe.groupby._shuffle_aggregate - - # Step 1 - Chunkwise groupby operation - chunk_name = f"{token or funcname(chunk)}-chunk" - chunked = ddf.map_partitions( - chunk, - meta=chunk(ddf._meta, **chunk_kwargs), - token=chunk_name, - **chunk_kwargs, - ) - - # Step 2 - Perform global sort or shuffle - shuffle_npartitions = max( - chunked.npartitions // split_every, - split_out, - ) - if sort and split_out > 1: - # Sort-based code path - result = ( - chunked.repartition(npartitions=shuffle_npartitions) - .sort_values( - gb_cols, - ignore_index=True, - shuffle_method=shuffle_method, - ) - .map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - ) - else: - # Hash-based code path - result = chunked.shuffle( - gb_cols, - npartitions=shuffle_npartitions, - ignore_index=True, - shuffle_method=shuffle_method, - ).map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - - # Step 3 - Repartition and return - if split_out < result.npartitions: - return result.repartition(npartitions=split_out) - return result - - -@_dask_cudf_performance_tracking -def 
groupby_agg( - ddf, - gb_cols, - aggs_in, - split_every=None, - split_out=None, - dropna=True, - sep="___", - sort=False, - as_index=True, - shuffle_method=None, -): - """Optimized groupby aggregation for Dask-CuDF. - - Parameters - ---------- - ddf : DataFrame - DataFrame object to perform grouping on. - gb_cols : str or list[str] - Column names to group by. - aggs_in : str, list, or dict - Aggregations to perform. - split_every : int (optional) - How to group intermediate aggregates. - dropna : bool - Drop grouping key values corresponding to NA values. - as_index : bool - Currently ignored. - sort : bool - Sort the group keys, better performance is obtained when - not sorting. - shuffle_method : str (optional) - Control how shuffling of the DataFrame is performed. - sep : str - Internal usage. - - - Notes - ----- - This "optimized" approach is more performant than the algorithm in - implemented in :meth:`DataFrame.apply` because it allows the cuDF - backend to perform multiple aggregations at once. - - This aggregation algorithm only supports the following options - - * "list" - * "count" - * "first" - * "last" - * "max" - * "mean" - * "min" - * "std" - * "sum" - * "var" - - - See Also - -------- - DataFrame.groupby : generic groupby of a DataFrame - dask.dataframe.apply_concat_apply : for more description of the - split_every argument. - - """ - # Assert that aggregations are supported - aggs = _redirect_aggs(aggs_in) - if not _aggs_optimized(aggs, OPTIMIZED_AGGS): - raise ValueError( - f"Supported aggs include {OPTIMIZED_AGGS} for groupby_agg API. " - f"Aggregations must be specified with dict or list syntax." - ) - - # If split_every is False, we use an all-to-one reduction - if split_every is False: - split_every = max(ddf.npartitions, 2) - - # Deal with default split_out and split_every params - split_every = split_every or 8 - split_out = split_out or 1 - - # Standardize `gb_cols`, `columns`, and `aggs` - if isinstance(gb_cols, str): - gb_cols = [gb_cols] - columns = [c for c in ddf.columns if c not in gb_cols] - if not isinstance(aggs, dict): - aggs = {col: aggs for col in columns} - - # Assert if our output will have a MultiIndex; this will be the case if - # any value in the `aggs` dict is not a string (i.e. multiple/named - # aggregations per column) - str_cols_out = True - aggs_renames = {} - for col in aggs: - if isinstance(aggs[col], str) or callable(aggs[col]): - aggs[col] = [aggs[col]] - elif isinstance(aggs[col], dict): - str_cols_out = False - col_aggs = [] - for k, v in aggs[col].items(): - aggs_renames[col, v] = k - col_aggs.append(v) - aggs[col] = col_aggs - else: - str_cols_out = False - if col in gb_cols: - columns.append(col) - - # Construct meta - _aggs = aggs.copy() - if str_cols_out: - # Metadata should use `str` for dict values if that is - # what the user originally specified (column names will - # be str, rather than tuples). 
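To make the fused-aggregation idea concrete, a minimal sketch (the columns "k" and "x" are hypothetical, not taken from this module): passing a dict of aggregations to a single cuDF .agg call computes count and sum together in one pass per partition, so that "mean" can be finalized later as sum divided by count.

import cudf

# One partition's worth of data with a hypothetical key column "k".
part = cudf.DataFrame({"k": [0, 1, 0, 1], "x": [1.0, 2.0, 3.0, 4.0]})

# Partition-level step: count and sum computed together per group.
partial = part.groupby("k", as_index=False, sort=False).agg(
    {"x": ["count", "sum"]}
)
# Later tree nodes sum these partials; "mean" is finalized as sum / count.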
- for col in aggs: - _aggs[col] = _aggs[col][0] - _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs) - if aggs_renames: - col_array = [] - agg_array = [] - for col, agg in _meta.columns: - col_array.append(col) - agg_array.append(aggs_renames.get((col, agg), agg)) - _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - chunk = _groupby_partition_agg - chunk_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - combine = _tree_node_agg - combine_kwargs = { - "gb_cols": gb_cols, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - aggregate = _finalize_gb_agg - aggregate_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "final_columns": _meta.columns, - "as_index": as_index, - "dropna": dropna, - "sort": sort, - "sep": sep, - "str_cols_out": str_cols_out, - "aggs_renames": aggs_renames, - } - - # Use shuffle_method=True for split_out>1 - if sort and split_out > 1 and shuffle_method is None: - shuffle_method = "tasks" - - # Check if we are using the shuffle-based algorithm - if shuffle_method: - # Shuffle-based aggregation - return _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token="cudf-aggregate", - sort=sort, - shuffle_method=shuffle_method - if isinstance(shuffle_method, str) - else None, - ) - - # Deal with sort/shuffle defaults - if split_out > 1 and sort: - raise ValueError( - "dask-cudf's groupby algorithm does not yet support " - "`sort=True` when `split_out>1`, unless a shuffle-based " - "algorithm is used. Please use `split_out=1`, group " - "with `sort=False`, or set `shuffle_method=True`." - ) - - # Determine required columns to enable column projection - required_columns = list( - set(gb_cols).union(aggs.keys()).intersection(ddf.columns) - ) - - return aca( - [ddf[required_columns]], - chunk=chunk, - chunk_kwargs=chunk_kwargs, - combine=combine, - combine_kwargs=combine_kwargs, - aggregate=aggregate, - aggregate_kwargs=aggregate_kwargs, - token="cudf-aggregate", - split_every=split_every, - split_out=split_out, - split_out_setup=split_out_on_cols, - split_out_setup_kwargs={"cols": gb_cols}, - sort=sort, - ignore_index=True, - ) - - -@_dask_cudf_performance_tracking -def _make_groupby_agg_call( - gb, aggs, split_every, split_out, shuffle_method=None -): - """Helper method to consolidate the common `groupby_agg` call for all - aggregations in one place - """ - - return groupby_agg( - gb.obj, - gb.by, - aggs, - split_every=split_every, - split_out=split_out, - sep=gb.sep, - sort=gb.sort, - as_index=gb.as_index, - shuffle_method=shuffle_method, - **gb.dropna, - ) - - -@_dask_cudf_performance_tracking -def _redirect_aggs(arg): - """Redirect aggregations to their corresponding name in cuDF""" - redirects = { - sum: "sum", - max: "max", - min: "min", - "collect": list, - "list": list, - } - if isinstance(arg, dict): - new_arg = dict() - for col in arg: - if isinstance(arg[col], list): - new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] - elif isinstance(arg[col], dict): - new_arg[col] = { - k: redirects.get(v, v) for k, v in arg[col].items() - } - else: - new_arg[col] = redirects.get(arg[col], arg[col]) - return new_arg - if isinstance(arg, list): - return [redirects.get(agg, agg) for agg in arg] - return redirects.get(arg, arg) - - -@_dask_cudf_performance_tracking -def _aggs_optimized(arg, supported: set): - """Check that aggregations in `arg` are a subset of 
`supported`""" - if isinstance(arg, (list, dict)): - if isinstance(arg, dict): - _global_set: set[str] = set() - for col in arg: - if isinstance(arg[col], list): - _global_set = _global_set.union(set(arg[col])) - elif isinstance(arg[col], dict): - _global_set = _global_set.union(set(arg[col].values())) - else: - _global_set.add(arg[col]) - else: - _global_set = set(arg) - - return bool(_global_set.issubset(supported)) - elif isinstance(arg, (str, type)): - return arg in supported - return False - - -@_dask_cudf_performance_tracking -def _groupby_optimized(gb): - """Check that groupby input can use dask-cudf optimized codepath""" - return isinstance(gb.obj, DaskDataFrame) and ( - isinstance(gb.by, str) - or (isinstance(gb.by, list) and all(isinstance(x, str) for x in gb.by)) - ) - - -def _make_name(col_name, sep="_"): - """Combine elements of `col_name` into a single string, or no-op if - `col_name` is already a string - """ - if isinstance(col_name, str): - return col_name - return sep.join(name for name in col_name if name != "") - - -@_dask_cudf_performance_tracking -def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): - """Initial partition-level aggregation task. - - This is the first operation to be executed on each input - partition in `groupby_agg`. Depending on `aggs`, four possible - groupby aggregations ("count", "sum", "min", and "max") are - performed. The result is then partitioned (by hashing `gb_cols`) - into a number of distinct dictionary elements. The number of - elements in the output dictionary (`split_out`) corresponds to - the number of partitions in the final output of `groupby_agg`. - """ - - # Modify dict for initial (partition-wise) aggregations - _agg_dict = {} - for col, agg_list in aggs.items(): - _agg_dict[col] = set() - for agg in agg_list: - if agg in ("mean", "std", "var"): - _agg_dict[col].add("count") - _agg_dict[col].add("sum") - else: - _agg_dict[col].add(agg) - _agg_dict[col] = list(_agg_dict[col]) - if set(agg_list).intersection({"std", "var"}): - pow2_name = _make_name((col, "pow2"), sep=sep) - df[pow2_name] = df[col].astype("float64").pow(2) - _agg_dict[pow2_name] = ["sum"] - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - _agg_dict - ) - output_columns = [_make_name(name, sep=sep) for name in gb.columns] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _tree_node_agg(df, gb_cols, dropna, sort, sep): - """Node in groupby-aggregation reduction tree. - - The input DataFrame (`df`) corresponds to the - concatenated output of one or more `_groupby_partition_agg` - tasks. In this function, "sum", "min" and/or "max" groupby - aggregations will be used to combine the statistics for - duplicate keys. 
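A toy illustration of that combine step, assuming the module's "___"-separated column naming and made-up values:

import cudf

# Two partial results for the same group key, as emitted by the
# partition-level aggregation; summing them merges the duplicate key.
left = cudf.DataFrame({"k": [0], "x___sum": [6.0], "x___count": [3]})
right = cudf.DataFrame({"k": [0], "x___sum": [10.0], "x___count": [2]})
combined = cudf.concat([left, right]).groupby("k", as_index=False).sum()
# A later step can finalize the mean as x___sum / x___count (16.0 / 5).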
- """ - - agg_dict = {} - for col in df.columns: - if col in gb_cols: - continue - agg = col.split(sep)[-1] - if agg in ("count", "sum"): - agg_dict[col] = ["sum"] - elif agg == "list": - agg_dict[col] = [list] - elif agg in OPTIMIZED_AGGS: - agg_dict[col] = [agg] - else: - raise ValueError(f"Unexpected aggregation: {agg}") - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - agg_dict - ) - - # Don't include the last aggregation in the column names - output_columns = [ - _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) - for name in gb.columns - ] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): - """Calculate variance (given count, sum, and sum-squared columns).""" - - # Select count, sum, and sum-squared - n = df[count_name] - x = df[sum_name] - x2 = df[pow2_sum_name] - - # Use sum-squared approach to get variance - var = x2 - x**2 / n - div = n - ddof - div[div < 1] = 1 # Avoid division by 0 - var /= div - - # Set appropriate NaN elements - # (since we avoided 0-division) - var[(n - ddof) == 0] = np.nan - - return var - - -@_dask_cudf_performance_tracking -def _finalize_gb_agg( - gb_in, - gb_cols, - aggs, - columns, - final_columns, - as_index, - dropna, - sort, - sep, - str_cols_out, - aggs_renames, -): - """Final aggregation task. - - This is the final operation on each output partitions - of the `groupby_agg` algorithm. This function must - take care of higher-order aggregations, like "mean", - "std" and "var". We also need to deal with the column - index, the row index, and final sorting behavior. - """ - - gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) - - # Deal with higher-order aggregations - for col in columns: - agg_list = aggs.get(col, []) - agg_set = set(agg_list) - if agg_set.intersection({"mean", "std", "var"}): - count_name = _make_name((col, "count"), sep=sep) - sum_name = _make_name((col, "sum"), sep=sep) - if agg_set.intersection({"std", "var"}): - pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) - var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) - if "var" in agg_list: - name_var = _make_name((col, "var"), sep=sep) - gb[name_var] = var - if "std" in agg_list: - name_std = _make_name((col, "std"), sep=sep) - gb[name_std] = np.sqrt(var) - gb.drop(columns=[pow2_sum_name], inplace=True) - if "mean" in agg_list: - mean_name = _make_name((col, "mean"), sep=sep) - gb[mean_name] = gb[sum_name] / gb[count_name] - if "sum" not in agg_list: - gb.drop(columns=[sum_name], inplace=True) - if "count" not in agg_list: - gb.drop(columns=[count_name], inplace=True) - if list in agg_list: - collect_name = _make_name((col, "list"), sep=sep) - gb[collect_name] = gb[collect_name].list.concat() - - # Ensure sorted keys if `sort=True` - if sort: - gb = gb.sort_values(gb_cols) - - # Set index if necessary - if as_index: - gb.set_index(gb_cols, inplace=True) - - # Unflatten column names - col_array = [] - agg_array = [] - for col in gb.columns: - if col in gb_cols: - col_array.append(col) - agg_array.append("") - else: - name, agg = col.split(sep) - col_array.append(name) - agg_array.append(aggs_renames.get((name, agg), agg)) - if str_cols_out: - gb.columns = col_array - else: - gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - return gb[final_columns] diff --git a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py 
b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py index 0421bd755f4..c544c32523f 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py @@ -1,11 +1 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from .csv import read_csv # noqa: F401 -from .json import read_json # noqa: F401 -from .orc import read_orc, to_orc # noqa: F401 -from .text import read_text # noqa: F401 - -try: - from .parquet import read_parquet, to_parquet # noqa: F401 -except ImportError: - pass +# Copyright (c) 2018-2025, NVIDIA CORPORATION. diff --git a/python/dask_cudf/dask_cudf/_legacy/io/csv.py b/python/dask_cudf/dask_cudf/_legacy/io/csv.py deleted file mode 100644 index fa5400344f9..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/csv.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -import os -from glob import glob -from warnings import warn - -from fsspec.utils import infer_compression - -from dask import dataframe as dd -from dask.base import tokenize -from dask.dataframe.io.csv import make_reader -from dask.utils import apply, parse_bytes - -import cudf - - -def read_csv(path, blocksize="default", **kwargs): - """ - Read CSV files into a :class:`.DataFrame`. - - This API parallelizes the :func:`cudf:cudf.read_csv` function in - the following ways: - - It supports loading many files at once using globstrings: - - >>> import dask_cudf - >>> df = dask_cudf.read_csv("myfiles.*.csv") - - In some cases it can break up large files: - - >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - - It can read CSV files from external resources (e.g. S3, HTTP, FTP) - - >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") - >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - - Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and - supports many of the same keyword arguments with the same - performance guarantees. See the docstring for - :func:`cudf:cudf.read_csv` for more information on available - keyword arguments. - - Parameters - ---------- - path : str, path object, or file-like object - Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as - builtin :py:func:`open` file handler function or - :py:class:`~io.StringIO`). - blocksize : int or str, default "256 MiB" - The target task partition size. If ``None``, a single block - is used for each file. - **kwargs : dict - Passthrough key-word arguments that are sent to - :func:`cudf:cudf.read_csv`. - - Notes - ----- - If any of `skipfooter`/`skiprows`/`nrows` are passed, - `blocksize` will default to None. - - Examples - -------- - >>> import dask_cudf - >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) - >>> ddf.compute() - a b - 0 1 hi - 1 2 hello - 2 3 ai - - """ - - # Handle `chunksize` deprecation - if "chunksize" in kwargs: - chunksize = kwargs.pop("chunksize", "default") - warn( - "`chunksize` is deprecated and will be removed in the future. " - "Please use `blocksize` instead.", - FutureWarning, - ) - if blocksize == "default": - blocksize = chunksize - - # Set default `blocksize` - if blocksize == "default": - if ( - kwargs.get("skipfooter", 0) != 0 - or kwargs.get("skiprows", 0) != 0 - or kwargs.get("nrows", None) is not None - ): - # Cannot read in blocks if skipfooter, - # skiprows or nrows is passed. 
- blocksize = None - else: - blocksize = "256 MiB" - - if "://" in str(path): - func = make_reader(cudf.read_csv, "read_csv", "CSV") - return func(path, blocksize=blocksize, **kwargs) - else: - return _internal_read_csv(path=path, blocksize=blocksize, **kwargs) - - -def _internal_read_csv(path, blocksize="256 MiB", **kwargs): - if isinstance(blocksize, str): - blocksize = parse_bytes(blocksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - name = "read-csv-" + tokenize( - path, tokenize, **kwargs - ) # TODO: get last modified time - - compression = kwargs.get("compression", "infer") - - if compression == "infer": - # Infer compression from first path by default - compression = infer_compression(filenames[0]) - - if compression and blocksize: - # compressed CSVs reading must read the entire file - kwargs.pop("byte_range", None) - warn( - "Warning %s compression does not support breaking apart files\n" - "Please ensure that each individual file can fit in memory and\n" - "use the keyword ``blocksize=None to remove this message``\n" - "Setting ``blocksize=(size of file)``" % compression - ) - blocksize = None - - if blocksize is None: - return read_csv_without_blocksize(path, **kwargs) - - # Let dask.dataframe generate meta - dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") - kwargs1 = kwargs.copy() - usecols = kwargs1.pop("usecols", None) - dtype = kwargs1.pop("dtype", None) - meta = dask_reader(filenames[0], **kwargs1)._meta - names = meta.columns - if usecols or dtype: - # Regenerate meta with original kwargs if - # `usecols` or `dtype` was specified - meta = dask_reader(filenames[0], **kwargs)._meta - - dsk = {} - i = 0 - dtypes = meta.dtypes.values - - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, blocksize): - kwargs2 = kwargs.copy() - kwargs2["byte_range"] = ( - start, - blocksize, - ) # specify which chunk of the file we care about - if start != 0: - kwargs2["names"] = names # no header in the middle of the file - kwargs2["header"] = None - dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) - - i += 1 - - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) - - -def _read_csv(fn, dtypes=None, **kwargs): - return cudf.read_csv(fn, **kwargs) - - -def read_csv_without_blocksize(path, **kwargs): - """Read entire CSV with optional compression (gzip/zip) - - Parameters - ---------- - path : str - path to files (support for glob) - """ - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - name = "read-csv-" + tokenize(path, **kwargs) - - meta_kwargs = kwargs.copy() - if "skipfooter" in meta_kwargs: - meta_kwargs.pop("skipfooter") - if "nrows" in meta_kwargs: - meta_kwargs.pop("nrows") - # Read "head" of first file (first 5 rows). - # Convert to empty df for metadata. 
- meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0] - - graph = { - (name, i): (apply, cudf.read_csv, [fn], kwargs) - for i, fn in enumerate(filenames) - } - - divisions = [None] * (len(filenames) + 1) - - return dd.core.new_dd_object(graph, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/json.py b/python/dask_cudf/dask_cudf/_legacy/io/json.py deleted file mode 100644 index 98c5ceedb76..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/json.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from functools import partial - -import numpy as np -from fsspec.core import get_compression, get_fs_token_paths - -import dask -from dask.utils import parse_bytes - -import cudf -from cudf.core.column import as_column -from cudf.utils.ioutils import _is_local_filesystem - -from dask_cudf.backends import _default_backend - - -def _read_json_partition( - paths, - fs=None, - include_path_column=False, - path_converter=None, - **kwargs, -): - # Transfer all data up front for remote storage - sources = ( - paths - if fs is None - else fs.cat_ranges( - paths, - [0] * len(paths), - fs.sizes(paths), - ) - ) - - if include_path_column: - # Add "path" column. - # Must iterate over sources sequentially - if not isinstance(include_path_column, str): - include_path_column = "path" - converted_paths = ( - paths - if path_converter is None - else [path_converter(path) for path in paths] - ) - dfs = [] - for i, source in enumerate(sources): - df = cudf.read_json(source, **kwargs) - df[include_path_column] = as_column( - converted_paths[i], length=len(df) - ) - dfs.append(df) - return cudf.concat(dfs) - else: - # Pass sources directly to cudf - return cudf.read_json(sources, **kwargs) - - -def read_json( - url_path, - engine="auto", - blocksize=None, - orient="records", - lines=None, - compression="infer", - aggregate_files=True, - **kwargs, -): - """Read JSON data into a :class:`.DataFrame`. - - This function wraps :func:`dask.dataframe.read_json`, and passes - ``engine=partial(cudf.read_json, engine="auto")`` by default. - - Parameters - ---------- - url_path : str, list of str - Location to read from. If a string, can include a glob character to - find a set of file names. - Supports protocol specifications such as ``"s3://"``. - engine : str or Callable, default "auto" - - If str, this value will be used as the ``engine`` argument - when :func:`cudf.read_json` is used to create each partition. - If a :obj:`~collections.abc.Callable`, this value will be used as the - underlying function used to create each partition from JSON - data. The default value is "auto", so that - ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to :func:`dask.dataframe.read_json` by default. - aggregate_files : bool or int - Whether to map multiple files to each output partition. If True, - the `blocksize` argument will be used to determine the number of - files in each partition. If any one file is larger than `blocksize`, - the `aggregate_files` argument will be ignored. If an integer value - is specified, the `blocksize` argument will be ignored, and that - number of files will be mapped to each partition. Default is True. - **kwargs : - Key-word arguments to pass through to :func:`dask.dataframe.read_json`. 
- - Returns - ------- - :class:`.DataFrame` - - Examples - -------- - Load single file - - >>> from dask_cudf import read_json - >>> read_json('myfile.json') # doctest: +SKIP - - Load large line-delimited JSON files using partitions of approx - 256MB size - - >>> read_json('data/file*.csv', blocksize=2**28) # doctest: +SKIP - - Load nested JSON data - - >>> read_json('myfile.json') # doctest: +SKIP - - See Also - -------- - dask.dataframe.read_json - - """ - - if lines is None: - lines = orient == "records" - if orient != "records" and lines: - raise ValueError( - 'Line-delimited JSON is only available with orient="records".' - ) - if blocksize and (orient != "records" or not lines): - raise ValueError( - "JSON file chunking only allowed for JSON-lines" - "input (orient='records', lines=True)." - ) - - inputs = [] - if aggregate_files and blocksize or int(aggregate_files) > 1: - # Attempt custom read if we are mapping multiple files - # to each output partition. Otherwise, upstream logic - # is sufficient. - - storage_options = kwargs.get("storage_options", {}) - fs, _, paths = get_fs_token_paths( - url_path, mode="rb", storage_options=storage_options - ) - if isinstance(aggregate_files, int) and aggregate_files > 1: - # Map a static file count to each partition - inputs = [ - paths[offset : offset + aggregate_files] - for offset in range(0, len(paths), aggregate_files) - ] - elif aggregate_files is True and blocksize: - # Map files dynamically (using blocksize) - file_sizes = fs.sizes(paths) # NOTE: This can be slow - blocksize = parse_bytes(blocksize) - if all([file_size <= blocksize for file_size in file_sizes]): - counts = np.unique( - np.floor(np.cumsum(file_sizes) / blocksize), - return_counts=True, - )[1] - offsets = np.concatenate([[0], counts.cumsum()]) - inputs = [ - paths[offsets[i] : offsets[i + 1]] - for i in range(len(offsets) - 1) - ] - - if inputs: - # Inputs were successfully populated. - # Use custom _read_json_partition function - # to generate each partition. - - compression = get_compression( - url_path[0] if isinstance(url_path, list) else url_path, - compression, - ) - _kwargs = dict( - orient=orient, - lines=lines, - compression=compression, - include_path_column=kwargs.get("include_path_column", False), - path_converter=kwargs.get("path_converter"), - ) - if not _is_local_filesystem(fs): - _kwargs["fs"] = fs - # TODO: Generate meta more efficiently - meta = _read_json_partition(inputs[0][:1], **_kwargs) - return dask.dataframe.from_map( - _read_json_partition, - inputs, - meta=meta, - **_kwargs, - ) - - # Fall back to dask.dataframe.read_json - return _default_backend( - dask.dataframe.read_json, - url_path, - engine=( - partial(cudf.read_json, engine=engine) - if isinstance(engine, str) - else engine - ), - blocksize=blocksize, - orient=orient, - lines=lines, - compression=compression, - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/orc.py b/python/dask_cudf/dask_cudf/_legacy/io/orc.py deleted file mode 100644 index fcf684fd6c8..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/orc.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
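The blocksize-driven bucketing above maps whole files to partitions so that each partition stays near the requested size; a small sketch of that arithmetic with made-up file sizes:

import numpy as np

file_sizes = [100, 120, 90, 200, 50]  # hypothetical sizes, MiB
blocksize = 256                       # target partition size, MiB
counts = np.unique(
    np.floor(np.cumsum(file_sizes) / blocksize), return_counts=True
)[1]
offsets = np.concatenate([[0], counts.cumsum()])
# offsets == [0, 2, 4, 5]: partitions take files [0:2], [2:4], [4:5]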
- -from io import BufferedWriter, IOBase - -from fsspec.core import get_fs_token_paths -from fsspec.utils import stringify_path -from pyarrow import orc as orc - -from dask import dataframe as dd -from dask.dataframe.io.utils import _get_pyarrow_dtypes - -import cudf - - -def _read_orc_stripe(source, fs, columns=None, kwargs=None): - """Pull out specific columns from specific stripe""" - path, stripe = source - if kwargs is None: - kwargs = {} - with fs.open(path, "rb") as f: - df_stripe = cudf.read_orc( - f, stripes=[stripe], columns=columns, **kwargs - ) - return df_stripe - - -def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read ORC files into a :class:`.DataFrame`. - - Note that this function is mostly borrowed from upstream Dask. - - Parameters - ---------- - path : str or list[str] - Location of file(s), which can be a full URL with protocol specifier, - and may include glob character if a single string. - columns : None or list[str] - Columns to load. If None, loads all. - filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out - row groups using statistics stored for each row group as - Parquet metadata. Row groups that do not match the given - filter predicate are not read. The predicate is expressed in - `disjunctive normal form (DNF) - `__ - like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary - boolean logical combinations of single column predicates. The - innermost tuples each describe a single column predicate. The - list of inner predicates is interpreted as a conjunction - (AND), forming a more selective and multiple column predicate. - Finally, the outermost list combines these filters as a - disjunction (OR). Predicates may also be passed as a list of - tuples. This form is interpreted as a single conjunction. To - express OR in predicates, one must use the (preferred) - notation of list of lists of tuples. - storage_options : None or dict - Further parameters to pass to the bytes backend. 
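To make the DNF filter notation concrete, a hedged usage sketch (the path and column names are hypothetical):

    >>> from dask_cudf import read_orc                            # doctest: +SKIP
    >>> # (x == 0 AND y > 5) OR (x == 1): outer list = OR, inner lists = AND
    >>> filters = [[("x", "=", 0), ("y", ">", 5)], [("x", "=", 1)]]
    >>> df = read_orc("data/*.orc", filters=filters)              # doctest: +SKIP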
- - See Also - -------- - dask.dataframe.read_orc - - Returns - ------- - dask_cudf.DataFrame - - """ - - storage_options = storage_options or {} - fs, _, paths = get_fs_token_paths( - path, mode="rb", storage_options=storage_options - ) - schema = None - nstripes_per_file = [] - for path in paths: - with fs.open(path, "rb") as f: - o = orc.ORCFile(f) - if schema is None: - schema = o.schema - elif schema != o.schema: - raise ValueError( - "Incompatible schemas while parsing ORC files" - ) - nstripes_per_file.append(o.nstripes) - schema = _get_pyarrow_dtypes(schema, categories=None) - if columns is not None: - ex = set(columns) - set(schema) - if ex: - raise ValueError( - f"Requested columns ({ex}) not in schema ({set(schema)})" - ) - else: - columns = list(schema) - - with fs.open(paths[0], "rb") as f: - meta = cudf.read_orc( - f, - stripes=[0] if nstripes_per_file[0] else None, - columns=columns, - **kwargs, - ) - - sources = [] - for path, n in zip(paths, nstripes_per_file): - for stripe in ( - range(n) - if filters is None - else cudf.io.orc._filter_stripes(filters, path) - ): - sources.append((path, stripe)) - - return dd.from_map( - _read_orc_stripe, - sources, - args=[fs], - columns=columns, - kwargs=kwargs, - meta=meta, - ) - - -def write_orc_partition(df, path, fs, filename, compression="snappy"): - full_path = fs.sep.join([path, filename]) - with fs.open(full_path, mode="wb") as out_file: - if not isinstance(out_file, IOBase): - out_file = BufferedWriter(out_file) - cudf.io.to_orc(df, out_file, compression=compression) - return full_path - - -def to_orc( - df, - path, - write_index=True, - storage_options=None, - compression="snappy", - compute=True, - **kwargs, -): - """ - Write a :class:`.DataFrame` to ORC file(s) (one file per partition). - - Parameters - ---------- - df : DataFrame - path : str or pathlib.Path - Destination directory for data. Prepend with protocol like ``s3://`` - or ``hdfs://`` for remote data. - write_index : boolean, optional - Whether or not to write the index. Defaults to True. - storage_options : None or dict - Further parameters to pass to the bytes backend. - compression : string or dict, optional - compute : bool, optional - If True (default) then the result is computed immediately. If - False then a :class:`~dask.delayed.Delayed` object is returned - for future computation. 
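A brief usage sketch for this write path (paths are illustrative; with ``compute=False`` a delayed object is returned for later execution):

    >>> import dask_cudf                                          # doctest: +SKIP
    >>> ddf = dask_cudf.read_orc("data/*.orc")                    # doctest: +SKIP
    >>> writes = to_orc(ddf, "out/", compression="snappy",
    ...                 compute=False)                            # doctest: +SKIP
    >>> writes.compute()                                          # doctest: +SKIP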
- - """ - - from dask import compute as dask_compute, delayed - - # TODO: Use upstream dask implementation once available - # (see: Dask Issue#5596) - - if hasattr(path, "name"): - path = stringify_path(path) - fs, _, _ = get_fs_token_paths( - path, mode="wb", storage_options=storage_options - ) - # Trim any protocol information from the path before forwarding - path = fs._strip_protocol(path) - - if write_index: - df = df.reset_index() - else: - # Not writing index - might as well drop it - df = df.reset_index(drop=True) - - fs.mkdirs(path, exist_ok=True) - - # Use i_offset and df.npartitions to define file-name list - filenames = ["part.%i.orc" % i for i in range(df.npartitions)] - - # write parts - dwrite = delayed(write_orc_partition) - parts = [ - dwrite(d, path, fs, filename, compression=compression) - for d, filename in zip(df.to_delayed(), filenames) - ] - - if compute: - return dask_compute(*parts) - - return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0638e4a1c3..c0792663c7e 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import itertools import warnings from functools import partial @@ -8,7 +8,7 @@ import pandas as pd from pyarrow import dataset as pa_ds, parquet as pq -from dask import dataframe as dd +import dask.dataframe as dd from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine try: @@ -448,65 +448,7 @@ def set_object_dtypes_from_pa_schema(df, schema): df._data[col_name] = col.astype(typ) -def read_parquet(path, columns=None, **kwargs): - """ - Read parquet files into a :class:`.DataFrame`. - - Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` - to coordinate the execution of :func:`cudf.read_parquet`, and to - ultimately create a :class:`.DataFrame` collection. - - See the :func:`dask.dataframe.read_parquet` documentation for - all available options. - - Examples - -------- - >>> from dask_cudf import read_parquet - >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP - - When dealing with one or more large parquet files having an - in-memory footprint >15% device memory, the ``split_row_groups`` - argument should be used to map Parquet **row-groups** to DataFrame - partitions (instead of **files** to partitions). For example, the - following code will map each row-group to a distinct partition: - - >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP - - To map **multiple** row-groups to each partition, an integer can be - passed to ``split_row_groups`` to specify the **maximum** number of - row-groups allowed in each output partition: - - >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP - - See Also - -------- - cudf.read_parquet - dask.dataframe.read_parquet - """ - if isinstance(columns, str): - columns = [columns] - - # Set "check_file_size" option to determine whether we - # should check the parquet-file size. This check is meant - # to "protect" users from `split_row_groups` default changes - check_file_size = kwargs.pop("check_file_size", 500_000_000) - if ( - check_file_size - and ("split_row_groups" not in kwargs) - and ("chunksize" not in kwargs) - ): - # User is not specifying `split_row_groups` or `chunksize`, - # so we should warn them if/when a file is ~>0.5GB on disk. 
- # They can set `split_row_groups` explicitly to silence/skip - # this check - if "read" not in kwargs: - kwargs["read"] = {} - kwargs["read"]["check_file_size"] = check_file_size - - return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) - - -to_parquet = partial(dd.to_parquet, engine=CudfEngine) +to_parquet = dd.to_parquet if create_metadata_file_dd is None: create_metadata_file = create_metadata_file_dd diff --git a/python/dask_cudf/dask_cudf/_legacy/io/text.py b/python/dask_cudf/dask_cudf/_legacy/io/text.py deleted file mode 100644 index 3757c85c80c..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/text.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -import os -from glob import glob - -import dask.dataframe as dd -from dask.utils import parse_bytes - -import cudf - - -def _read_text(source, **kwargs): - # Wrapper for cudf.read_text operation - fn, byte_range = source - return cudf.read_text(fn, byte_range=byte_range, **kwargs) - - -def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs): - if isinstance(chunksize, str): - chunksize = parse_bytes(chunksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - if chunksize and byte_range: - raise ValueError("Cannot specify both chunksize and byte_range.") - - if chunksize: - sources = [] - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, chunksize): - byte_range = ( - start, - chunksize, - ) # specify which chunk of the file we care about - sources.append((fn, byte_range)) - else: - sources = [(fn, byte_range) for fn in filenames] - - return dd.from_map( - _read_text, - sources, - meta=cudf.Series([], dtype="O"), - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/sorting.py b/python/dask_cudf/dask_cudf/_legacy/sorting.py deleted file mode 100644 index a2ba4d1878e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/sorting.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import warnings -from collections.abc import Iterator -from functools import wraps - -import cupy -import numpy as np -import tlz as toolz - -from dask import config -from dask.base import tokenize -from dask.dataframe import methods -from dask.dataframe.core import DataFrame, Index, Series -from dask.dataframe.shuffle import rearrange_by_column -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M - -import cudf -from cudf.api.types import _is_categorical_dtype -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -_SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported - - -def _deprecate_shuffle_kwarg(func): - @wraps(func) - def wrapper(*args, **kwargs): - old_arg_value = kwargs.pop("shuffle", None) - - if old_arg_value is not None: - new_arg_value = old_arg_value - msg = ( - "the 'shuffle' keyword is deprecated, " - "use 'shuffle_method' instead." - ) - - warnings.warn(msg, FutureWarning) - if kwargs.get("shuffle_method") is not None: - msg = ( - "Can only specify 'shuffle' " - "or 'shuffle_method', not both." 
- ) - raise TypeError(msg) - kwargs["shuffle_method"] = new_arg_value - return func(*args, **kwargs) - - return wrapper - - -@_dask_cudf_performance_tracking -def set_index_post(df, index_name, drop, column_dtype): - df2 = df.set_index(index_name, drop=drop) - df2.columns = df2.columns.astype(column_dtype) - return df2 - - -@_dask_cudf_performance_tracking -def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): - if ascending: - partitions = divisions.searchsorted(s, side="right") - 1 - else: - partitions = ( - len(divisions) - divisions.searchsorted(s, side="right") - 1 - ) - partitions[(partitions < 0) | (partitions >= len(divisions) - 1)] = ( - 0 if ascending else (len(divisions) - 2) - ) - partitions[s._columns[0].isnull().values] = ( - len(divisions) - 2 if na_position == "last" else 0 - ) - return partitions - - -@_dask_cudf_performance_tracking -def _quantile(a, q): - n = len(a) - if not len(a): - return None, n - return ( - a.quantile(q=q.tolist(), interpolation="nearest", method="table"), - n, - ) - - -@_dask_cudf_performance_tracking -def merge_quantiles(finalq, qs, vals): - """Combine several quantile calculations of different data. - [NOTE: Same logic as dask.array merge_percentiles] - """ - if isinstance(finalq, Iterator): - finalq = list(finalq) - finalq = np.array(finalq) - qs = list(map(list, qs)) - vals = list(vals) - vals, Ns = zip(*vals) - Ns = list(Ns) - - L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N])) - if not L: - raise ValueError("No non-trivial arrays found") - qs, vals, Ns = L - - if len(vals) != len(qs) or len(Ns) != len(qs): - raise ValueError("qs, vals, and Ns parameters must be the same length") - - # transform qs and Ns into number of observations between quantiles - counts = [] - for q, N in zip(qs, Ns): - count = np.empty(len(q)) - count[1:] = np.diff(q) - count[0] = q[0] - count *= N - counts.append(count) - - def _append_counts(val, count): - val["_counts"] = count - return val - - # Sort by calculated quantile values, then number of observations. - combined_vals_counts = cudf.core.reshape._merge_sorted( - [*map(_append_counts, vals, counts)] - ) - combined_counts = cupy.asnumpy(combined_vals_counts["_counts"].values) - combined_vals = combined_vals_counts.drop(columns=["_counts"]) - - # quantile-like, but scaled by total number of observations - combined_q = np.cumsum(combined_counts) - - # rescale finalq quantiles to match combined_q - desired_q = finalq * sum(Ns) - - # TODO: Support other interpolation methods - # For now - Always use "nearest" for interpolation - left = np.searchsorted(combined_q, desired_q, side="left") - right = np.searchsorted(combined_q, desired_q, side="right") - 1 - np.minimum(left, len(combined_vals) - 1, left) # don't exceed max index - lower = np.minimum(left, right) - upper = np.maximum(left, right) - lower_residual = np.abs(combined_q[lower] - desired_q) - upper_residual = np.abs(combined_q[upper] - desired_q) - mask = lower_residual > upper_residual - index = lower # alias; we no longer need lower - index[mask] = upper[mask] - rv = combined_vals.iloc[index] - return rv.reset_index(drop=True) - - -@_dask_cudf_performance_tracking -def _approximate_quantile(df, q): - """Approximate quantiles of DataFrame or Series. 
- [NOTE: Same logic as dask.dataframe Series quantile] - """ - # current implementation needs q to be sorted so - # sort if array-like, otherwise leave it alone - q_ndarray = np.array(q) - if q_ndarray.ndim > 0: - q_ndarray.sort(kind="mergesort") - q = q_ndarray - - # Lets assume we are dealing with a DataFrame throughout - if isinstance(df, (Series, Index)): - df = df.to_frame() - assert isinstance(df, DataFrame) - final_type = df._meta._constructor - - # Create metadata - meta = df._meta_nonempty.quantile(q=q, method="table") - - # Define final action (create df with quantiles as index) - def finalize_tsk(tsk): - return (final_type, tsk) - - return_type = df.__class__ - - # pandas/cudf uses quantile in [0, 1] - # numpy / cupy uses [0, 100] - qs = np.asarray(q) - token = tokenize(df, qs) - - if len(qs) == 0: - name = "quantiles-" + token - empty_index = cudf.Index([], dtype=float) - return Series( - { - (name, 0): final_type( - {col: [] for col in df.columns}, - name=df.name, - index=empty_index, - ) - }, - name, - df._meta, - [None, None], - ) - else: - new_divisions = [np.min(q), np.max(q)] - - name = "quantiles-1-" + token - val_dsk = { - (name, i): (_quantile, key, qs) - for i, key in enumerate(df.__dask_keys__()) - } - - name2 = "quantiles-2-" + token - merge_dsk = { - (name2, 0): finalize_tsk( - (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk)) - ) - } - dsk = toolz.merge(val_dsk, merge_dsk) - graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df]) - df = return_type(graph, name2, meta, new_divisions) - - def set_quantile_index(df): - df.index = q - return df - - df = df.map_partitions(set_quantile_index, meta=meta) - return df - - -@_dask_cudf_performance_tracking -def quantile_divisions(df, by, npartitions): - qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() - divisions = _approximate_quantile(df[by], qn).compute() - columns = divisions.columns - - # TODO: Make sure divisions are correct for all dtypes.. 
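For intuition, the quantile points computed above look like this for a made-up partition count:

import numpy as np

npartitions = 4
qn = np.linspace(0.0, 1.0, npartitions + 1).tolist()
# qn == [0.0, 0.25, 0.5, 0.75, 1.0]; the sort-key values found at these
# quantiles become the division boundaries, one bucket per output partition.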
- if ( - len(columns) == 1 - and df[columns[0]].dtype != "object" - and not _is_categorical_dtype(df[columns[0]].dtype) - ): - dtype = df[columns[0]].dtype - divisions = divisions[columns[0]].astype("int64") - divisions.iloc[-1] += 1 - divisions = sorted( - divisions.drop_duplicates().astype(dtype).to_arrow().tolist(), - key=lambda x: (x is None, x), - ) - else: - for col in columns: - dtype = df[col].dtype - if dtype != "object": - divisions[col] = divisions[col].astype("int64") - divisions[col].iloc[-1] += 1 - divisions[col] = divisions[col].astype(dtype) - else: - if last := divisions[col].iloc[-1]: - val = chr(ord(last[0]) + 1) - else: - val = "this string intentionally left empty" # any but "" - divisions[col].iloc[-1] = val - divisions = divisions.drop_duplicates().sort_index() - return divisions - - -@_deprecate_shuffle_kwarg -@_dask_cudf_performance_tracking -def sort_values( - df, - by, - max_branch=None, - divisions=None, - set_divisions=False, - ignore_index=False, - ascending=True, - na_position="last", - shuffle_method=None, - sort_function=None, - sort_function_kwargs=None, -): - """Sort by the given list/tuple of column names.""" - - if not isinstance(ascending, bool): - raise ValueError("ascending must be either True or False") - if na_position not in ("first", "last"): - raise ValueError("na_position must be either 'first' or 'last'") - - npartitions = df.npartitions - if isinstance(by, tuple): - by = list(by) - elif not isinstance(by, list): - by = [by] - - # parse custom sort function / kwargs if provided - sort_kwargs = { - "by": by, - "ascending": ascending, - "na_position": na_position, - } - if sort_function is None: - sort_function = M.sort_values - if sort_function_kwargs is not None: - sort_kwargs.update(sort_function_kwargs) - - # handle single partition case - if npartitions == 1: - return df.map_partitions(sort_function, **sort_kwargs) - - # Step 1 - Calculate new divisions (if necessary) - if divisions is None: - divisions = quantile_divisions(df, by, npartitions) - - # Step 2 - Perform repartitioning shuffle - meta = df._meta._constructor_sliced([0]) - if not isinstance(divisions, (cudf.Series, cudf.DataFrame)): - dtype = df[by[0]].dtype - divisions = df._meta._constructor_sliced(divisions, dtype=dtype) - - partitions = df[by].map_partitions( - _set_partitions_pre, - divisions=divisions, - ascending=ascending, - na_position=na_position, - meta=meta, - ) - - df2 = df.assign(_partitions=partitions) - df3 = rearrange_by_column( - df2, - "_partitions", - max_branch=max_branch, - npartitions=len(divisions) - 1, - shuffle_method=_get_shuffle_method(shuffle_method), - ignore_index=ignore_index, - ).drop(columns=["_partitions"]) - df3.divisions = (None,) * (df3.npartitions + 1) - - # Step 3 - Return final sorted df - df4 = df3.map_partitions(sort_function, **sort_kwargs) - if not isinstance(divisions, cudf.DataFrame) and set_divisions: - # Can't have multi-column divisions elsewhere in dask (yet) - df4.divisions = tuple(methods.tolist(divisions)) - - return df4 - - -def get_default_shuffle_method(): - # Note that `dask.utils.get_default_shuffle_method` - # will return "p2p" by default when a distributed - # client is present. 
Dask-cudf supports "p2p", but - # will not use it by default (yet) - default = config.get("dataframe.shuffle.method", "tasks") - if default not in _SHUFFLE_SUPPORT: - default = "tasks" - return default - - -def _get_shuffle_method(shuffle_method): - # Utility to set the shuffle_method-kwarg default - # and to validate user-specified options - shuffle_method = shuffle_method or get_default_shuffle_method() - if shuffle_method not in _SHUFFLE_SUPPORT: - raise ValueError( - "Dask-cudf only supports the following shuffle " - f"methods: {_SHUFFLE_SUPPORT}. Got shuffle_method={shuffle_method}" - ) - - return shuffle_method diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index fceaaf185e8..f33733d9583 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import warnings from collections.abc import Iterator @@ -11,14 +11,12 @@ from packaging.version import Version from pandas.api.types import is_scalar -import dask.dataframe as dd from dask import config from dask.array.dispatch import percentile_lookup from dask.dataframe.backends import ( DataFrameBackendEntrypoint, PandasBackendEntrypoint, ) -from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( categorical_dtype_dispatch, concat_dispatch, @@ -28,6 +26,8 @@ hash_object_dispatch, is_categorical_dtype_dispatch, make_meta_dispatch, + meta_nonempty, + partd_encode_dispatch, pyarrow_schema_dispatch, to_pyarrow_table_dispatch, tolist_dispatch, @@ -46,13 +46,6 @@ from cudf.api.types import is_string_dtype from cudf.utils.performance_tracking import _dask_cudf_performance_tracking -from ._legacy.core import DataFrame, Index, Series - -get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) -get_parallel_type.register(cudf.Series, lambda _: Series) -get_parallel_type.register(cudf.BaseIndex, lambda _: Index) - - # Required for Arrow filesystem support in read_parquet PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0") @@ -318,7 +311,7 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( - (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) + (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype) # , Series) ) @_dask_cudf_performance_tracking def is_categorical_dtype_cudf(obj): @@ -464,28 +457,21 @@ def sizeof_cudf_series_index(obj): return obj.memory_usage() -# TODO: Remove try/except when cudf is pinned to dask>=2023.10.0 -try: - from dask.dataframe.dispatch import partd_encode_dispatch - - @partd_encode_dispatch.register(cudf.DataFrame) - def _simple_cudf_encode(_): - # Basic pickle-based encoding for a partd k-v store - import pickle +@partd_encode_dispatch.register(cudf.DataFrame) +def _simple_cudf_encode(_): + # Basic pickle-based encoding for a partd k-v store + import pickle - import partd + import partd - def join(dfs): - if not dfs: - return cudf.DataFrame() - else: - return cudf.concat(dfs) - - dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) - return partial(partd.Encode, dumps, pickle.loads, join) + def join(dfs): + if not dfs: + return cudf.DataFrame() + else: + return cudf.concat(dfs) -except ImportError: - pass + dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) + return partial(partd.Encode, dumps, pickle.loads, join) def _default_backend(func, *args, **kwargs): @@ -557,105 +543,22 @@ def to_cudf_dispatch_from_cudf(data, 
**kwargs): return data -# Define "cudf" backend engine to be registered with Dask -class CudfBackendEntrypoint(DataFrameBackendEntrypoint): - """Backend-entrypoint class for Dask-DataFrame +# Define the "cudf" backend for "legacy" Dask DataFrame +class LegacyCudfBackendEntrypoint(DataFrameBackendEntrypoint): + """Backend-entrypoint class for legacy Dask-DataFrame This class is registered under the name "cudf" for the - ``dask.dataframe.backends`` entrypoint in ``setup.cfg``. - Dask-DataFrame will use the methods defined in this class - in place of ``dask.dataframe.`` when the - "dataframe.backend" configuration is set to "cudf": - - Examples - -------- - >>> import dask - >>> import dask.dataframe as dd - >>> with dask.config.set({"dataframe.backend": "cudf"}): - ... ddf = dd.from_dict({"a": range(10)}) - >>> type(ddf) - + ``dask.dataframe.backends`` entrypoint in ``pyproject.toml``. + This "legacy" backend is only used for CSV support. """ - @classmethod - def to_backend_dispatch(cls): - return to_cudf_dispatch - - @classmethod - def to_backend(cls, data: dd.core._Frame, **kwargs): - if isinstance(data._meta, (cudf.DataFrame, cudf.Series, cudf.Index)): - # Already a cudf-backed collection - _unsupported_kwargs("cudf", "cudf", kwargs) - return data - return data.map_partitions(cls.to_backend_dispatch(), **kwargs) - - @staticmethod - def from_dict( - data, - npartitions, - orient="columns", - dtype=None, - columns=None, - constructor=cudf.DataFrame, - ): - return _default_backend( - dd.from_dict, - data, - npartitions=npartitions, - orient=orient, - dtype=dtype, - columns=columns, - constructor=constructor, - ) - - @staticmethod - def read_parquet(*args, engine=None, **kwargs): - from dask_cudf._legacy.io.parquet import CudfEngine - - _raise_unsupported_parquet_kwargs(**kwargs) - return _default_backend( - dd.read_parquet, - *args, - engine=CudfEngine, - **kwargs, - ) - - @staticmethod - def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json - - return read_json(*args, **kwargs) - @staticmethod - def read_orc(*args, **kwargs): - from dask_cudf._legacy.io import read_orc - - return read_orc(*args, **kwargs) - - @staticmethod - def read_csv(*args, **kwargs): - from dask_cudf._legacy.io import read_csv - - return read_csv(*args, **kwargs) - - @staticmethod - def read_hdf(*args, **kwargs): - # HDF5 reader not yet implemented in cudf - warnings.warn( - "read_hdf is not yet implemented in cudf/dask_cudf. " - "Moving to cudf from pandas. Expect poor performance!" - ) - return _default_backend(dd.read_hdf, *args, **kwargs).to_backend( - "cudf" - ) - - -# Define "cudf" backend entrypoint for dask-expr -class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint): +# Define the "cudf" backend for expr-based Dask DataFrame +class CudfBackendEntrypoint(DataFrameBackendEntrypoint): """Backend-entrypoint class for Dask-Expressions This class is registered under the name "cudf" for the - ``dask-expr.dataframe.backends`` entrypoint in ``setup.cfg``. + ``dask_expr.dataframe.backends`` entrypoint in ``pyproject.toml``. 
Dask-DataFrame will use the methods defined in this class in place of ``dask_expr.`` when the "dataframe.backend" configuration is set to "cudf": @@ -746,12 +649,12 @@ def read_csv( @staticmethod def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json as read_json_impl + from dask_cudf.io.json import read_json as read_json_impl return read_json_impl(*args, **kwargs) @staticmethod def read_orc(*args, **kwargs): - from dask_cudf._legacy.io.orc import read_orc as legacy_read_orc + from dask_cudf.io.orc import read_orc as legacy_read_orc return legacy_read_orc(*args, **kwargs) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 5fd217209ec..32461104ef9 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,56 +1,41 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import textwrap +import warnings +from importlib import import_module import dask.dataframe as dd -from dask.tokenize import tokenize import cudf from cudf.utils.performance_tracking import _dask_cudf_performance_tracking # This module provides backward compatibility for legacy import patterns. -if dd.DASK_EXPR_ENABLED: - from dask_cudf._expr.collection import ( - DataFrame, - Index, - Series, - ) -else: - from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 - +from dask_cudf._expr.collection import ( + DataFrame, # noqa: F401 + Index, # noqa: F401 + Series, # noqa: F401 +) concat = dd.concat @_dask_cudf_performance_tracking def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): - from dask_cudf import QUERY_PLANNING_ON - if isinstance(getattr(data, "index", None), cudf.MultiIndex): raise NotImplementedError( "dask_cudf does not support MultiIndex Dataframes." ) - # Dask-expr doesn't support the `name` argument - name = {} - if not QUERY_PLANNING_ON: - name = { - "name": name - or ("from_cudf-" + tokenize(data, npartitions or chunksize)) - } - return dd.from_pandas( data, npartitions=npartitions, chunksize=chunksize, sort=sort, - **name, ) -from_cudf.__doc__ = ( - textwrap.dedent( - """ +from_cudf.__doc__ = textwrap.dedent( + """ Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`. This function is a thin wrapper around @@ -58,9 +43,23 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): arguments (described below) excepting that it operates on cuDF rather than pandas objects.\n """ - ) - # TODO: `dd.from_pandas.__doc__` is empty when - # `DASK_DATAFRAME__QUERY_PLANNING=True` - # since dask-expr does not provide a docstring for from_pandas. - + textwrap.dedent(dd.from_pandas.__doc__ or "") -) +) + textwrap.dedent(dd.from_pandas.__doc__) + + +def _deprecated_api(old_api, new_api=None, rec=None): + def inner_func(*args, **kwargs): + if new_api: + # Use alternative + msg = f"{old_api} is now deprecated. " + msg += rec or f"Please use {new_api} instead." + warnings.warn(msg, FutureWarning) + new_attr = new_api.split(".") + module = import_module(".".join(new_attr[:-1])) + return getattr(module, new_attr[-1])(*args, **kwargs) + + # No alternative - raise an error + raise NotImplementedError( + f"{old_api} is no longer supported. 
" + (rec or "") + ) + + return inner_func diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 9bca33e414a..a5175c9bbe7 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api +from dask_cudf.core import _deprecated_api from . import csv, json, orc, parquet, text # noqa: F401 @@ -15,20 +15,13 @@ ) to_orc = _deprecated_api( "dask_cudf.io.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.orc.to_orc", rec="Please use the DataFrame.to_orc method instead.", ) read_text = _deprecated_api( "dask_cudf.io.read_text", new_api="dask_cudf.read_text" ) -if QUERY_PLANNING_ON: - read_parquet = parquet.read_parquet -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = parquet.read_parquet to_parquet = _deprecated_api( "dask_cudf.io.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 29f98b14511..e36ee04d827 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import os from glob import glob @@ -25,11 +25,11 @@ def read_csv(path, blocksize="default", **kwargs): >>> import dask_cudf >>> df = dask_cudf.read_csv("myfiles.*.csv") - In some cases it can break up large files: + It can break up large files if blocksize is specified: >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - It can read CSV files from external resources (e.g. S3, HTTP, FTP) + It can read CSV files from external resources (e.g. S3, HTTP, FTP): >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") @@ -44,15 +44,15 @@ def read_csv(path, blocksize="default", **kwargs): ---------- path : str, path object, or file-like object Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as + ``py._path.local.LocalPath``), URL (including HTTP, FTP, and S3 + locations), or any object with a ``read()`` method (such as builtin :py:func:`open` file handler function or :py:class:`~io.StringIO`). blocksize : int or str, default "256 MiB" The target task partition size. If ``None``, a single block is used for each file. **kwargs : dict - Passthrough key-word arguments that are sent to + Passthrough keyword arguments that are sent to :func:`cudf:cudf.read_csv`. Notes diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 8f85ea54c0a..3022ebb2a5b 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -1,8 +1,209 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
-from dask_cudf import _deprecated_api +from functools import partial -read_json = _deprecated_api( - "dask_cudf.io.json.read_json", - new_api="dask_cudf.read_json", -) +import numpy as np +from fsspec.core import get_compression, get_fs_token_paths + +import dask +from dask.utils import parse_bytes + +import cudf +from cudf.core.column import as_column +from cudf.utils.ioutils import _is_local_filesystem + +from dask_cudf.backends import _default_backend + + +def _read_json_partition( + paths, + fs=None, + include_path_column=False, + path_converter=None, + **kwargs, +): + # Transfer all data up front for remote storage + sources = ( + paths + if fs is None + else fs.cat_ranges( + paths, + [0] * len(paths), + fs.sizes(paths), + ) + ) + + if include_path_column: + # Add "path" column. + # Must iterate over sources sequentially + if not isinstance(include_path_column, str): + include_path_column = "path" + converted_paths = ( + paths + if path_converter is None + else [path_converter(path) for path in paths] + ) + dfs = [] + for i, source in enumerate(sources): + df = cudf.read_json(source, **kwargs) + df[include_path_column] = as_column( + converted_paths[i], length=len(df) + ) + dfs.append(df) + return cudf.concat(dfs) + else: + # Pass sources directly to cudf + return cudf.read_json(sources, **kwargs) + + +def read_json( + url_path, + engine="auto", + blocksize=None, + orient="records", + lines=None, + compression="infer", + aggregate_files=True, + **kwargs, +): + """Read JSON data into a :class:`.DataFrame`. + + This function wraps :func:`dask.dataframe.read_json`, and passes + ``engine=partial(cudf.read_json, engine="auto")`` by default. + + Parameters + ---------- + url_path : str, list of str + Location to read from. If a string, can include a glob character to + find a set of file names. + Supports protocol specifications such as ``"s3://"``. + engine : str or Callable, default "auto" + + If str, this value will be used as the ``engine`` argument + when :func:`cudf.read_json` is used to create each partition. + If a :obj:`~collections.abc.Callable`, this value will be used as the + underlying function used to create each partition from JSON + data. The default value is "auto", so that + ``engine=partial(cudf.read_json, engine="auto")`` will be + passed to :func:`dask.dataframe.read_json` by default. + aggregate_files : bool or int + Whether to map multiple files to each output partition. If True, + the `blocksize` argument will be used to determine the number of + files in each partition. If any one file is larger than `blocksize`, + the `aggregate_files` argument will be ignored. If an integer value + is specified, the `blocksize` argument will be ignored, and that + number of files will be mapped to each partition. Default is True. + **kwargs : + Key-word arguments to pass through to :func:`dask.dataframe.read_json`. + + Returns + ------- + :class:`.DataFrame` + + Examples + -------- + Load single file + + >>> from dask_cudf import read_json + >>> read_json('myfile.json') # doctest: +SKIP + + Load large line-delimited JSON files using partitions of approx + 256MB size + + >>> read_json('data/file*.csv', blocksize=2**28) # doctest: +SKIP + + Load nested JSON data + + >>> read_json('myfile.json') # doctest: +SKIP + + See Also + -------- + dask.dataframe.read_json + + """ + + if lines is None: + lines = orient == "records" + if orient != "records" and lines: + raise ValueError( + 'Line-delimited JSON is only available with orient="records".' 
+ ) + if blocksize and (orient != "records" or not lines): + raise ValueError( + "JSON file chunking only allowed for JSON-lines" + "input (orient='records', lines=True)." + ) + + inputs = [] + if aggregate_files and blocksize or int(aggregate_files) > 1: + # Attempt custom read if we are mapping multiple files + # to each output partition. Otherwise, upstream logic + # is sufficient. + + storage_options = kwargs.get("storage_options", {}) + fs, _, paths = get_fs_token_paths( + url_path, mode="rb", storage_options=storage_options + ) + if isinstance(aggregate_files, int) and aggregate_files > 1: + # Map a static file count to each partition + inputs = [ + paths[offset : offset + aggregate_files] + for offset in range(0, len(paths), aggregate_files) + ] + elif aggregate_files is True and blocksize: + # Map files dynamically (using blocksize) + file_sizes = fs.sizes(paths) # NOTE: This can be slow + blocksize = parse_bytes(blocksize) + if all([file_size <= blocksize for file_size in file_sizes]): + counts = np.unique( + np.floor(np.cumsum(file_sizes) / blocksize), + return_counts=True, + )[1] + offsets = np.concatenate([[0], counts.cumsum()]) + inputs = [ + paths[offsets[i] : offsets[i + 1]] + for i in range(len(offsets) - 1) + ] + + if inputs: + # Inputs were successfully populated. + # Use custom _read_json_partition function + # to generate each partition. + + compression = get_compression( + url_path[0] if isinstance(url_path, list) else url_path, + compression, + ) + _kwargs = dict( + orient=orient, + lines=lines, + compression=compression, + include_path_column=kwargs.get("include_path_column", False), + path_converter=kwargs.get("path_converter"), + ) + if not _is_local_filesystem(fs): + _kwargs["fs"] = fs + # TODO: Generate meta more efficiently + meta = _read_json_partition(inputs[0][:1], **_kwargs) + return dask.dataframe.from_map( + _read_json_partition, + inputs, + meta=meta, + **_kwargs, + ) + + # Fall back to dask.dataframe.read_json + return _default_backend( + dask.dataframe.read_json, + url_path, + engine=( + partial(cudf.read_json, engine=engine) + if isinstance(engine, str) + else engine + ), + blocksize=blocksize, + orient=orient, + lines=lines, + compression=compression, + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index 5219cdacc31..5de28751912 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,13 +1,195 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from dask_cudf import _deprecated_api - -read_orc = _deprecated_api( - "dask_cudf.io.orc.read_orc", - new_api="dask_cudf.read_orc", -) -to_orc = _deprecated_api( - "dask_cudf.io.orc.to_orc", - new_api="dask_cudf._legacy.io.orc.to_orc", - rec="Please use the DataFrame.to_orc method instead.", -) +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
+ +from io import BufferedWriter, IOBase + +from fsspec.core import get_fs_token_paths +from fsspec.utils import stringify_path +from pyarrow import orc as orc + +from dask import dataframe as dd +from dask.dataframe.io.utils import _get_pyarrow_dtypes + +import cudf + + +def _read_orc_stripe(source, fs, columns=None, kwargs=None): + """Pull out specific columns from specific stripe""" + path, stripe = source + if kwargs is None: + kwargs = {} + with fs.open(path, "rb") as f: + df_stripe = cudf.read_orc( + f, stripes=[stripe], columns=columns, **kwargs + ) + return df_stripe + + +def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): + """Read ORC files into a :class:`.DataFrame`. + + Note that this function is mostly borrowed from upstream Dask. + + Parameters + ---------- + path : str or list[str] + Location of file(s), which can be a full URL with protocol specifier, + and may include glob character if a single string. + columns : None or list[str] + Columns to load. If None, loads all. + filters : None or list of tuple or list of lists of tuples + If not None, specifies a filter predicate used to filter out + row groups using statistics stored for each row group as + Parquet metadata. Row groups that do not match the given + filter predicate are not read. The predicate is expressed in + `disjunctive normal form (DNF) + `__ + like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary + boolean logical combinations of single column predicates. The + innermost tuples each describe a single column predicate. The + list of inner predicates is interpreted as a conjunction + (AND), forming a more selective and multiple column predicate. + Finally, the outermost list combines these filters as a + disjunction (OR). Predicates may also be passed as a list of + tuples. This form is interpreted as a single conjunction. To + express OR in predicates, one must use the (preferred) + notation of list of lists of tuples. + storage_options : None or dict + Further parameters to pass to the bytes backend. 
+ + See Also + -------- + dask.dataframe.read_orc + + Returns + ------- + dask_cudf.DataFrame + + """ + + storage_options = storage_options or {} + fs, _, paths = get_fs_token_paths( + path, mode="rb", storage_options=storage_options + ) + schema = None + nstripes_per_file = [] + for path in paths: + with fs.open(path, "rb") as f: + o = orc.ORCFile(f) + if schema is None: + schema = o.schema + elif schema != o.schema: + raise ValueError( + "Incompatible schemas while parsing ORC files" + ) + nstripes_per_file.append(o.nstripes) + schema = _get_pyarrow_dtypes(schema, categories=None) + if columns is not None: + ex = set(columns) - set(schema) + if ex: + raise ValueError( + f"Requested columns ({ex}) not in schema ({set(schema)})" + ) + else: + columns = list(schema) + + with fs.open(paths[0], "rb") as f: + meta = cudf.read_orc( + f, + stripes=[0] if nstripes_per_file[0] else None, + columns=columns, + **kwargs, + ) + + sources = [] + for path, n in zip(paths, nstripes_per_file): + for stripe in ( + range(n) + if filters is None + else cudf.io.orc._filter_stripes(filters, path) + ): + sources.append((path, stripe)) + + return dd.from_map( + _read_orc_stripe, + sources, + args=[fs], + columns=columns, + kwargs=kwargs, + meta=meta, + ) + + +def write_orc_partition(df, path, fs, filename, compression="snappy"): + full_path = fs.sep.join([path, filename]) + with fs.open(full_path, mode="wb") as out_file: + if not isinstance(out_file, IOBase): + out_file = BufferedWriter(out_file) + cudf.io.to_orc(df, out_file, compression=compression) + return full_path + + +def to_orc( + df, + path, + write_index=True, + storage_options=None, + compression="snappy", + compute=True, + **kwargs, +): + """ + Write a :class:`.DataFrame` to ORC file(s) (one file per partition). + + Parameters + ---------- + df : DataFrame + path : str or pathlib.Path + Destination directory for data. Prepend with protocol like ``s3://`` + or ``hdfs://`` for remote data. + write_index : boolean, optional + Whether or not to write the index. Defaults to True. + storage_options : None or dict + Further parameters to pass to the bytes backend. + compression : string or dict, optional + compute : bool, optional + If True (default) then the result is computed immediately. If + False then a :class:`~dask.delayed.Delayed` object is returned + for future computation. 
+ + """ + + from dask import compute as dask_compute, delayed + + # TODO: Use upstream dask implementation once available + # (see: Dask Issue#5596) + + if hasattr(path, "name"): + path = stringify_path(path) + fs, _, _ = get_fs_token_paths( + path, mode="wb", storage_options=storage_options + ) + # Trim any protocol information from the path before forwarding + path = fs._strip_protocol(path) + + if write_index: + df = df.reset_index() + else: + # Not writing index - might as well drop it + df = df.reset_index(drop=True) + + fs.mkdirs(path, exist_ok=True) + + # Use i_offset and df.npartitions to define file-name list + filenames = ["part.%i.orc" % i for i in range(df.npartitions)] + + # write parts + dwrite = delayed(write_orc_partition) + parts = [ + dwrite(d, path, fs, filename, compression=compression) + for d, filename in zip(df.to_delayed(), filenames) + ] + + if compute: + return dask_compute(*parts) + + return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ba6209c4820..a953dce787d 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -37,10 +37,9 @@ def TaskList(*x): import cudf -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api - # Dask-expr imports CudfEngine from this module from dask_cudf._legacy.io.parquet import CudfEngine +from dask_cudf.core import _deprecated_api if TYPE_CHECKING: from collections.abc import MutableMapping @@ -832,15 +831,8 @@ def read_parquet_expr( ) -if QUERY_PLANNING_ON: - read_parquet = read_parquet_expr - read_parquet.__doc__ = read_parquet_expr.__doc__ -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.parquet.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = read_parquet_expr +read_parquet.__doc__ = read_parquet_expr.__doc__ to_parquet = _deprecated_api( "dask_cudf.io.parquet.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index f5509cf91c3..48eca13e16f 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import math import os @@ -11,10 +11,6 @@ from dask.utils import tmpfile import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") def test_read_json_backend_dispatch(tmp_path): @@ -137,7 +133,3 @@ def test_deprecated_api_paths(tmp_path): with pytest.warns(match="dask_cudf.io.read_json is now deprecated"): df2 = dask_cudf.io.read_json(path) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.json.read_json is now deprecated"): - df2 = dask_cudf.io.json.read_json(path) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index b6064d851ca..4aac463420b 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
import glob import os @@ -12,10 +12,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) sample_orc = os.path.join(cur_dir, "data/orc/sample.orc") @@ -159,7 +155,3 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.read_orc is now deprecated"): df2 = dask_cudf.io.read_orc(paths) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.orc.read_orc is now deprecated"): - df2 = dask_cudf.io.orc.read_orc(paths) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 6efe6c4f388..9f7031f4d2a 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import glob import math @@ -16,11 +16,6 @@ import dask_cudf from dask_cudf._legacy.io.parquet import create_metadata_file -from dask_cudf.tests.utils import ( - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) # Check if create_metadata_file is supported by # the current dask.dataframe version @@ -450,7 +445,6 @@ def test_create_metadata_file(tmpdir, partition_on): dd.assert_eq(ddf1, ddf2) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @need_create_meta def test_create_metadata_file_inconsistent_schema(tmpdir): # NOTE: This test demonstrates that the CudfEngine @@ -531,19 +525,6 @@ def test_cudf_list_struct_write(tmpdir): dd.assert_eq(df, new_ddf) -@skip_dask_expr("Not necessary in dask-expr") -def test_check_file_size(tmpdir): - # Test simple file-size check to help warn users - # of upstream change to `split_row_groups` default - fn = str(tmpdir.join("test.parquet")) - cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) - with pytest.warns(match="large parquet file"): - # Need to use `dask_cudf._legacy.io` path - # TODO: Remove outdated `check_file_size` functionality - dask_cudf._legacy.io.read_parquet(fn, check_file_size=1).compute() - - -@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0") def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning @@ -626,7 +607,6 @@ def test_timezone_column(tmpdir): dd.assert_eq(got, expect) -@require_dask_expr() @pytest.mark.skipif( not dask_cudf.backends.PYARROW_GE_15, reason="Requires pyarrow 15", @@ -677,17 +657,8 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.to_parquet is now deprecated"): dask_cudf.io.to_parquet(df, tmpdir) - if dask_cudf.QUERY_PLANNING_ON: - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - else: - with pytest.warns(match="legacy dask_cudf.io.read_parquet"): - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) - with pytest.warns(match="legacy dask_cudf.io.parquet.read_parquet"): - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.parquet.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) diff --git 
a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index 90907f6fb99..7c53b89a883 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import os import socket @@ -14,7 +14,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import QUERY_PLANNING_ON moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") @@ -136,7 +135,7 @@ def test_read_parquet_open_file_options_raises(): pytest.param( "arrow", marks=pytest.mark.skipif( - not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15, + not dask_cudf.backends.PYARROW_GE_15, reason="Not supported", ), ), diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index e35b6411a9d..f4d59334e03 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. import os @@ -9,10 +9,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) text_file = os.path.join(cur_dir, "data/text/sample.pgn") @@ -42,7 +38,3 @@ def test_deprecated_api_paths(): with pytest.warns(match="dask_cudf.io.read_text is now deprecated"): df2 = dask_cudf.io.read_text(text_file, delimiter=".") dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.text.read_text is now deprecated"): - df2 = dask_cudf.io.text.read_text(text_file, delimiter=".") - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py index 1caf4e81d8e..eb1d007cc16 100644 --- a/python/dask_cudf/dask_cudf/io/text.py +++ b/python/dask_cudf/dask_cudf/io/text.py @@ -1,8 +1,56 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. -from dask_cudf import _deprecated_api +import os +from glob import glob -read_text = _deprecated_api( - "dask_cudf.io.text.read_text", - new_api="dask_cudf.read_text", -) +import dask.dataframe as dd +from dask.utils import parse_bytes + +import cudf + + +def _read_text(source, **kwargs): + # Wrapper for cudf.read_text operation + fn, byte_range = source + return cudf.read_text(fn, byte_range=byte_range, **kwargs) + + +def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs): + if isinstance(chunksize, str): + chunksize = parse_bytes(chunksize) + + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood:{type(path)}") + + if not filenames: + msg = f"A file in: {filenames} does not exist." 
+ raise FileNotFoundError(msg) + + if chunksize and byte_range: + raise ValueError("Cannot specify both chunksize and byte_range.") + + if chunksize: + sources = [] + for fn in filenames: + size = os.path.getsize(fn) + for start in range(0, size, chunksize): + byte_range = ( + start, + chunksize, + ) # specify which chunk of the file we care about + sources.append((fn, byte_range)) + else: + sources = [(fn, byte_range) for fn in filenames] + + return dd.from_map( + _read_text, + sources, + meta=cudf.Series([], dtype="O"), + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 3fbb2aacd2c..c6b01a648eb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -13,7 +13,6 @@ from cudf.testing._utils import does_not_raise import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr ############################################################################# # Datetime Accessor # @@ -112,7 +111,6 @@ def test_categorical_accessor_initialization2(data): dsr.cat -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_basic(data): cat = data.copy() diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7101fb7e00a..31957a106ff 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
import random @@ -9,18 +9,12 @@ import dask from dask import dataframe as dd -from dask.dataframe.core import make_meta as dask_make_meta, meta_nonempty +from dask.dataframe.dispatch import make_meta as dask_make_meta, meta_nonempty from dask.utils import M import cudf import dask_cudf -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) rng = np.random.default_rng(seed=0) @@ -299,37 +293,6 @@ def test_set_index_sorted(): gddf1.set_index("val", sorted=True) -@pytest.mark.parametrize("nelem", [10, 200, 1333]) -@pytest.mark.parametrize("index", [None, "myindex"]) -def test_rearrange_by_divisions(nelem, index): - with dask.config.set(scheduler="single-threaded"): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "x": rng.integers(0, 20, size=nelem), - "y": rng.normal(size=nelem), - "z": rng.choice(["dog", "cat", "bird"], nelem), - } - ) - df["z"] = df["z"].astype("category") - - ddf1 = dd.from_pandas(df, npartitions=4) - gdf1 = dask_cudf.from_cudf( - cudf.DataFrame.from_pandas(df), npartitions=4 - ) - ddf1.index.name = index - gdf1.index.name = index - divisions = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20) - - expect = dd.shuffle.rearrange_by_divisions( - ddf1, "x", divisions=divisions, shuffle_method="tasks" - ) - result = dd.shuffle.rearrange_by_divisions( - gdf1, "x", divisions=divisions, shuffle_method="tasks" - ) - dd.assert_eq(expect, result) - - def test_assign(): rng = np.random.default_rng(seed=0) df = pd.DataFrame( @@ -393,44 +356,6 @@ def test_setitem_scalar_datetime(): np.testing.assert_array_equal(got["z"], df["z"]) -@skip_dask_expr("Not relevant for dask-expr") -@pytest.mark.parametrize( - "func", - [ - lambda: pd.DataFrame( - {"A": rng.random(10), "B": rng.random(10)}, - index=list("abcdefghij"), - ), - lambda: pd.DataFrame( - { - "A": rng.random(10), - "B": list("a" * 10), - "C": pd.Series( - [str(20090101 + i) for i in range(10)], - dtype="datetime64[ns]", - ), - }, - index=list("abcdefghij"), - ), - lambda: pd.Series(list("abcdefghijklmnop")), - lambda: pd.Series( - rng.random(10), - index=pd.Index( - [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]" - ), - ), - ], -) -def test_repr(func): - pdf = func() - gdf = cudf.from_pandas(pdf) - gddf = dd.from_pandas(gdf, npartitions=3, sort=False) - - assert repr(gddf) - if hasattr(pdf, "_repr_html_"): - assert gddf._repr_html_() - - @pytest.mark.skip(reason="datetime indexes not fully supported in cudf") @pytest.mark.parametrize("start", ["1d", "5d", "1w", "12h"]) @pytest.mark.parametrize("stop", ["1d", "3d", "8h"]) @@ -657,20 +582,20 @@ def test_hash_object_dispatch(index): ) # DataFrame - result = dd.core.hash_object_dispatch(obj, index=index) + result = dd.dispatch.hash_object_dispatch(obj, index=index) expected = dask_cudf.backends.hash_object_cudf(obj, index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # Series - result = dd.core.hash_object_dispatch(obj["x"], index=index) + result = dd.dispatch.hash_object_dispatch(obj["x"], index=index) expected = dask_cudf.backends.hash_object_cudf(obj["x"], index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # DataFrame with MultiIndex obj_multi = obj.set_index(["x", "z"], drop=True) - result = dd.core.hash_object_dispatch(obj_multi, index=index) + result = dd.dispatch.hash_object_dispatch(obj_multi, index=index) expected = dask_cudf.backends.hash_object_cudf(obj_multi, index=index) assert isinstance(result, cudf.Series) 
dd.assert_eq(result, expected) @@ -784,7 +709,6 @@ def test_dataframe_set_index(): assert_eq(ddf.compute(), pddf.compute()) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_series_describe(): random.seed(0) sr = cudf.datasets.randomdata(20)["x"] @@ -800,7 +724,6 @@ def test_series_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_dataframe_describe(): random.seed(0) df = cudf.datasets.randomdata(20) @@ -814,7 +737,6 @@ def test_dataframe_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_zero_std_describe(): num = 84886781 df = cudf.DataFrame( @@ -864,7 +786,7 @@ def test_merging_categorical_columns(): ddf_1 = dask_cudf.from_cudf(df_1, npartitions=2) - ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"]) + ddf_1 = ddf_1.categorize(columns=["cat_col"]) df_2 = cudf.DataFrame( {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]} @@ -872,7 +794,7 @@ def test_merging_categorical_columns(): ddf_2 = dask_cudf.from_cudf(df_2, npartitions=2) - ddf_2 = dd.categorical.categorize(ddf_2, columns=["cat_col"]) + ddf_2 = ddf_2.categorize(columns=["cat_col"]) expected = cudf.DataFrame( { @@ -932,14 +854,9 @@ def func(x): result = ds.map_partitions(func, meta=s.values) - if QUERY_PLANNING_ON: - # Check Array and round-tripped DataFrame - dask.array.assert_eq(result, func(s)) - dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) - else: - # Legacy version still carries numpy metadata - # See: https://github.com/dask/dask/issues/11017 - dask.array.assert_eq(result.compute(), func(s)) + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) def test_implicit_array_conversion_cupy_sparse(): @@ -981,7 +898,6 @@ def test_series_isin_error(): ddf.isin([1, 5, "a"]).compute() -@require_dask_expr() def test_to_backend_simplify(): # Check that column projection is not blocked by to_backend with dask.config.set({"dataframe.backend": "pandas"}): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 9bd3b506db0..11ca0c6a783 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
import numpy as np import pandas as pd @@ -13,12 +13,7 @@ from cudf.testing._utils import expect_warning_if import dask_cudf -from dask_cudf._legacy.groupby import OPTIMIZED_AGGS, _aggs_optimized -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - xfail_dask_expr, -) +from dask_cudf._expr.groupby import OPTIMIZED_AGGS, _aggs_optimized def assert_cudf_groupby_layers(ddf): @@ -78,18 +73,12 @@ def test_groupby_basic(series, aggregation, pdf): expect = getattr(gdf_grouped, aggregation)() actual = getattr(ddf_grouped, aggregation)() - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) if not series: expect = gdf_grouped.agg({"x": aggregation}) actual = ddf_grouped.agg({"x": aggregation}) - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) @@ -134,13 +123,6 @@ def test_groupby_agg(func, aggregation, pdf): check_dtype = aggregation != "count" - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - - # groupby.agg should add an explicit getitem layer - # to improve/enable column projection - assert hlg_layer(actual.dask, "getitem") - dd.assert_eq(expect, actual, check_names=False, check_dtype=check_dtype) @@ -556,20 +538,13 @@ def test_groupby_categorical_key(): True, pytest.param( False, - marks=xfail_dask_expr("as_index not supported in dask-expr"), - ), - ], -) -@pytest.mark.parametrize( - "fused", - [ - True, - pytest.param( - False, - marks=require_dask_expr("Not supported by legacy API"), + marks=pytest.mark.xfail( + reason="as_index not supported in dask-expr" + ), ), ], ) +@pytest.mark.parametrize("fused", [True, False]) @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) @@ -590,19 +565,16 @@ def test_groupby_agg_params( "c": ["mean", "std", "var"], } - fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {} + fused_kwarg = {"fused": fused} split_kwargs = {"split_every": split_every, "split_out": split_out} if split_out == "use_dask_default": split_kwargs.pop("split_out") # Avoid using as_index when query-planning is enabled - if QUERY_PLANNING_ON: - with pytest.warns(FutureWarning, match="argument is now deprecated"): - # Should warn when `as_index` is used - ddf.groupby(["name", "a"], sort=False, as_index=as_index) - maybe_as_index = {"as_index": as_index} if as_index is False else {} - else: - maybe_as_index = {"as_index": as_index} + with pytest.warns(FutureWarning, match="argument is now deprecated"): + # Should warn when `as_index` is used + ddf.groupby(["name", "a"], sort=False, as_index=as_index) + maybe_as_index = {"as_index": as_index} if as_index is False else {} # Check `sort=True` behavior if split_out == 1: @@ -671,7 +643,6 @@ def test_groupby_agg_params( dd.assert_eq(gf, pf) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] ) @@ -711,7 +682,6 @@ def test_is_supported(arg, supported): assert _aggs_optimized(arg, OPTIMIZED_AGGS) is supported -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_groupby_unique_lists(): df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]}) gdf = cudf.from_pandas(df) @@ -758,7 +728,7 @@ def test_groupby_first_last(data, agg): ) -@xfail_dask_expr("Co-alignment check fails in dask-expr") 
+@pytest.mark.xfail(reason="Co-alignment check fails in dask-expr") def test_groupby_with_list_of_series(): df = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}) gdf = dask_cudf.from_cudf(df, npartitions=2) @@ -773,7 +743,6 @@ def test_groupby_with_list_of_series(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "func", [ @@ -833,7 +802,7 @@ def test_groupby_all_columns(func): expect = func(ddf) actual = func(gddf) - dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON) + dd.assert_eq(expect, actual, check_names=False) def test_groupby_shuffle(): @@ -870,15 +839,3 @@ def test_groupby_shuffle(): # NOTE: `shuffle_method=True` should be default got = gddf.groupby("a", sort=False).agg(spec, split_out=2) dd.assert_eq(expect, got.compute().sort_index()) - - if not QUERY_PLANNING_ON: - # Sorted aggregation fails with split_out>1 when shuffle is False - # (sort=True, split_out=2, shuffle_method=False) - with pytest.raises(ValueError): - gddf.groupby("a", sort=True).agg( - spec, shuffle_method=False, split_out=2 - ) - - # Check shuffle kwarg deprecation - with pytest.warns(match="'shuffle' keyword is deprecated"): - gddf.groupby("a", sort=True).agg(spec, shuffle=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py index 0b7c7855e07..2d05345bc4a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_onehot.py +++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import pandas as pd import pytest @@ -8,12 +8,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr - -# No dask-expr support -pytestmark = xfail_dask_expr( - "Newer dask version needed", lt_version="2024.5.0" -) def test_get_dummies_cat(): diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 02c815427f3..68d6e72660e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -10,7 +10,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr @pytest.mark.parametrize("ascending", [True, False]) @@ -67,7 +66,6 @@ def test_sort_repartition(): dd.assert_eq(len(new_ddf), len(ddf)) -@xfail_dask_expr("missing null support", lt_version="2024.5.1") @pytest.mark.parametrize("na_position", ["first", "last"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index b44b3f939e7..ef6765f39d1 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -1,22 +1,12 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
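+# Copyright (c) 2022-2025, NVIDIA CORPORATION.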
import numpy as np import pandas as pd -import pytest -from packaging.version import Version -import dask import dask.dataframe as dd import cudf -from dask_cudf import QUERY_PLANNING_ON - -if QUERY_PLANNING_ON: - DASK_VERSION = Version(dask.__version__) -else: - DASK_VERSION = None - def _make_random_frame(nelem, npartitions=2, include_na=False): rng = np.random.default_rng(seed=0) @@ -30,26 +20,3 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): gdf = cudf.DataFrame.from_pandas(df) dgf = dd.from_pandas(gdf, npartitions=npartitions) return df, dgf - - -_default_reason = "Not compatible with dask-expr" - - -def skip_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - skip = QUERY_PLANNING_ON - return pytest.mark.skipif(skip, reason=reason) - - -def xfail_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - xfail = QUERY_PLANNING_ON - return pytest.mark.xfail(xfail, reason=reason) - - -def require_dask_expr(reason="requires dask-expr"): - return pytest.mark.skipif(not QUERY_PLANNING_ON, reason=reason) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index a8cb696d7f6..b88816a3d47 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -39,10 +39,10 @@ classifiers = [ ] [project.entry-points."dask.dataframe.backends"] -cudf = "dask_cudf.backends:CudfBackendEntrypoint" +cudf = "dask_cudf.backends:LegacyCudfBackendEntrypoint" [project.entry-points."dask_expr.dataframe.backends"] -cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" +cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ @@ -102,8 +102,5 @@ filterwarnings = [ # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask", - # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437 - # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False` - "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning", ] xfail_strict = true diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 5f9a04d3cee..259492b98d1 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -34,9 +34,6 @@ endif() unset(cudf_FOUND) -# Find Python early so that later commands can use it -find_package(Python 3.10 REQUIRED COMPONENTS Interpreter) - set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd index 2d070ddda69..fbd478f963f 100644 --- a/python/pylibcudf/pylibcudf/hashing.pxd +++ b/python/pylibcudf/pylibcudf/hashing.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t @@ -16,6 +16,10 @@ cpdef Table murmurhash3_x64_128( uint64_t seed=* ) +cpdef Column xxhash_32( + Table input, + uint32_t seed=* +) cpdef Column xxhash_64( Table input, diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi index a849f5d0729..d535d842a18 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyi +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -9,6 +9,7 @@ LIBCUDF_DEFAULT_HASH_SEED: Final[int] def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_32(input: Table, seed: int = ...) -> Column: ... def xxhash_64(input: Table, seed: int = ...) -> Column: ... def md5(input: Table) -> Column: ... def sha1(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 548cffc0ce8..1f093b20c6b 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -13,6 +13,7 @@ from pylibcudf.libcudf.hash cimport ( sha256 as cpp_sha256, sha384 as cpp_sha384, sha512 as cpp_sha512, + xxhash_32 as cpp_xxhash_32, xxhash_64 as cpp_xxhash_64, ) from pylibcudf.libcudf.table.table cimport table @@ -30,6 +31,7 @@ __all__ = [ "sha256", "sha384", "sha512", + "xxhash_32", "xxhash_64", ] @@ -95,6 +97,37 @@ cpdef Table murmurhash3_x64_128( return Table.from_libcudf(move(c_result)) +cpdef Column xxhash_32( + Table input, + uint32_t seed=DEFAULT_HASH_SEED +): + """Computes the xxHash 32-bit hash value of each row in the given table. + + For details, see :cpp:func:`xxhash_32`. + + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_xxhash_32( + input.view(), + seed + ) + + return Column.from_libcudf(move(c_result)) + + cpdef Column xxhash_64( Table input, uint64_t seed=DEFAULT_HASH_SEED diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd index 4e8a01b41a5..46fdf62cd6b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
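+# Copyright (c) 2020-2025, NVIDIA CORPORATION.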
from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -44,6 +44,11 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const table_view& input ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_32( + const table_view& input, + const uint32_t seed + ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_64( const table_view& input, const uint64_t seed diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 83fb50fa4ef..7096dbe14ff 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import hashlib import struct @@ -34,7 +34,9 @@ def hash_single_uint32(val, seed=0): def hash_combine_32(lhs, rhs): - return np.uint32(lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2))) + return np.uint32( + int((lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)))) % 2**32 + ) def uint_hash_combine_32(lhs, rhs): @@ -80,22 +82,6 @@ def list_struct_table(): return data -def python_hash_value(x, method): - if method == "murmurhash3_x86_32": - return libcudf_mmh3_x86_32(x) - elif method == "murmurhash3_x64_128": - hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) - hasher.update(x) - # libcudf returns a tuple of two 64-bit integers - return hasher.utupledigest() - elif method == "xxhash_64": - return xxhash.xxh64( - x, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED - ).intdigest() - else: - return getattr(hashlib, method)(x).hexdigest() - - @pytest.mark.parametrize( "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"] ) @@ -115,6 +101,23 @@ def py_hasher(val): assert_column_eq(got, expect) +def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl): + def py_hasher(val): + return xxhash.xxh32( + scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ).intdigest() + + expect = pa.array( + [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], + type=pa.uint32(), + ) + got = plc.hashing.xxhash_32( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) + + assert_column_eq(got, expect) + + def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): return xxhash.xxh64( @@ -125,7 +128,9 @@ def py_hasher(val): [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], type=pa.uint64(), ) - got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0) + got = plc.hashing.xxhash_64( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) assert_column_eq(got, expect)
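A minimal usage sketch (not part of the patch) of the new `xxhash_32` binding added above; it assumes a pylibcudf build that includes this change, and it uses the `pyarrow` and `xxhash` packages plus the interop helpers purely for a host-side comparison, mirroring `test_hash_column_xxhash32`.

# Quick host-side check of the new 32-bit xxHash binding (mirrors
# test_hash_column_xxhash32 above). Assumes this patch is installed and that
# the pyarrow and xxhash packages are available.
import pyarrow as pa
import xxhash

import pylibcudf as plc

strings = ["foo", "bar", "baz"]
tbl = plc.Table([plc.interop.from_arrow(pa.array(strings))])

# One uint32 hash value per input row
got = plc.hashing.xxhash_32(tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)

# Reference values computed on the host from each string's UTF-8 bytes
expect = [
    xxhash.xxh32(s.encode(), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED).intdigest()
    for s in strings
]
assert plc.interop.to_arrow(got).to_pylist() == expect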