diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5e2f46714d9..e0b315f34fc 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -8,10 +8,9 @@ notebooks/ @rapidsai/cudf-python-codeowners python/dask_cudf/ @rapidsai/cudf-dask-codeowners #cmake code owners -cpp/CMakeLists.txt @rapidsai/cudf-cmake-codeowners -cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners -**/cmake/ @rapidsai/cudf-cmake-codeowners -*.cmake @rapidsai/cudf-cmake-codeowners +CMakeLists.txt @rapidsai/cudf-cmake-codeowners +**/cmake/ @rapidsai/cudf-cmake-codeowners +*.cmake @rapidsai/cudf-cmake-codeowners #java code owners java/ @rapidsai/cudf-java-codeowners diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index af49942c8cd..d80e4fef0d0 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -1,11 +1,13 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -euo pipefail package_name="libcudf" package_dir="python/libcudf" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + rapids-logger "Generating build requirements" rapids-dependency-file-generator \ @@ -28,8 +30,6 @@ export PIP_NO_BUILD_ISOLATION=0 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - mkdir -p ${package_dir}/final_dist python -m auditwheel repair \ --exclude libnvcomp.so.4 \ diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index db86721755d..3c6dba72164 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Support invoking test_python_cudf.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ @@ -24,8 +24,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -rapids-logger "pytest dask_cudf (dask-expr)" -DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ +rapids-logger "pytest dask_cudf" +./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ @@ -34,13 +34,6 @@ DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ --cov-report=term -rapids-logger "pytest dask_cudf (legacy)" -DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . - rapids-logger "pytest cudf_kafka" ./ci/run_cudf_kafka_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index e15949f4bdb..44f430ce98d 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -eou pipefail @@ -30,21 +30,11 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" # Run tests in dask_cudf/tests and dask_cudf/io/tests -rapids-logger "pytest dask_cudf (dask-expr)" +rapids-logger "pytest dask_cudf" pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ +python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ . 
popd - -# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy) -rapids-logger "pytest dask_cudf (legacy)" -pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . -popd diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cb814aa8c0f..9dabe4e8800 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -276,7 +276,7 @@ rapids_cpm_init() include(${rapids-cmake-dir}/cpm/rapids_logger.cmake) rapids_cpm_rapids_logger() -rapids_make_logger(cudf EXPORT_SET cudf-exports) +rapids_make_logger(cudf EXPORT_SET cudf-exports LOGGER_DEFAULT_LEVEL WARN) # find jitify include(cmake/thirdparty/get_jitify.cmake) @@ -461,6 +461,7 @@ add_library( src/hash/sha256_hash.cu src/hash/sha384_hash.cu src/hash/sha512_hash.cu + src/hash/xxhash_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/arrow_utilities.cpp diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu index 3502cbcea2a..1085b03ac7b 100644 --- a/cpp/benchmarks/join/distinct_join.cu +++ b/cpp/benchmarks/join/distinct_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,13 +23,8 @@ void distinct_inner_join(nvbench::state& state, auto join = [](cudf::table_view const& probe_input, cudf::table_view const& build_input, cudf::null_equality compare_nulls) { - auto const has_nulls = - cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - auto hj_obj = cudf::distinct_hash_join{ - build_input, probe_input, has_nulls, compare_nulls}; - return hj_obj.inner_join(); + auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls}; + return hj_obj.inner_join(probe_input); }; BM_join(state, join); @@ -42,13 +37,8 @@ void distinct_left_join(nvbench::state& state, auto join = [](cudf::table_view const& probe_input, cudf::table_view const& build_input, cudf::null_equality compare_nulls) { - auto const has_nulls = - cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - auto hj_obj = cudf::distinct_hash_join{ - build_input, probe_input, has_nulls, compare_nulls}; - return hj_obj.left_join(); + auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls}; + return hj_obj.left_join(probe_input); }; BM_join(state, join); diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index 2acc10105cf..9a10163eb15 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,19 +36,24 @@ using cudf::experimental::row::lhs_index_type; using cudf::experimental::row::rhs_index_type; /** - * @brief An comparator adapter wrapping both self comparator and two table comparator + * @brief A custom comparator used for the build table insertion */ -template -struct comparator_adapter { - comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} - - __device__ constexpr auto operator()( +struct always_not_equal { + __device__ constexpr bool operator()( cuco::pair const&, cuco::pair const&) const noexcept { // All build table keys are distinct thus `false` no matter what return false; } +}; + +/** + * @brief A comparator adapter wrapping the two table comparator + */ +template +struct comparator_adapter { + comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} __device__ constexpr auto operator()( cuco::pair const& lhs, @@ -62,56 +67,14 @@ struct comparator_adapter { Equal _d_equal; }; -template -struct hasher_adapter { - hasher_adapter(Hasher const& d_hasher = {}) : _d_hasher{d_hasher} {} - - template - __device__ constexpr auto operator()(cuco::pair const& key) const noexcept - { - return _d_hasher(key.first); - } - - private: - Hasher _d_hasher; -}; - /** * @brief Distinct hash join that builds hash table in creation and probes results in subsequent * `*_join` member functions. * - * @tparam HasNested Flag indicating whether there are nested columns in build/probe table + * This class enables the distinct hash join scheme that builds hash table once, and probes as many + * times as needed (possibly in parallel). */ -template -struct distinct_hash_join { - private: - /// Device row equal type - using d_equal_type = cudf::experimental::row::equality::strong_index_comparator_adapter< - cudf::experimental::row::equality::device_row_comparator>; - using hasher = hasher_adapter>; - using probing_scheme_type = cuco::linear_probing<1, hasher>; - using cuco_storage_type = cuco::storage<1>; - - /// Hash table type - using hash_table_type = cuco::static_set, - cuco::extent, - cuda::thread_scope_device, - comparator_adapter, - probing_scheme_type, - cudf::detail::cuco_allocator, - cuco_storage_type>; - - bool _has_nulls; ///< true if nulls are present in either build table or probe table - cudf::null_equality _nulls_equal; ///< whether to consider nulls as equal - cudf::table_view _build; ///< input table to build the hash map - cudf::table_view _probe; ///< input table to probe the hash map - std::shared_ptr - _preprocessed_build; ///< input table preprocssed for row operators - std::shared_ptr - _preprocessed_probe; ///< input table preprocssed for row operators - hash_table_type _hash_table; ///< hash table built on `_build` - +class distinct_hash_join { public: distinct_hash_join() = delete; ~distinct_hash_join() = default; @@ -120,21 +83,28 @@ struct distinct_hash_join { distinct_hash_join& operator=(distinct_hash_join const&) = delete; distinct_hash_join& operator=(distinct_hash_join&&) = delete; + /** + * @brief Hasher adapter used by distinct hash join + */ + struct hasher { + template + __device__ constexpr hash_value_type operator()( + cuco::pair const& key) const noexcept + { + return key.first; + } + }; + /** + * @brief Constructor that internally builds the hash table based on the given `build` table. * * @throw cudf::logic_error if the number of columns in `build` table is 0.
* * @param build The build table, from which the hash table is built - * @param probe The probe table - * @param has_nulls Flag to indicate if any nulls exist in the `build` table or - * any `probe` table that will be used later for join. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. */ distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - bool has_nulls, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream); @@ -143,12 +113,36 @@ struct distinct_hash_join { */ std::pair>, std::unique_ptr>> - inner_join(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; /** * @copydoc cudf::distinct_hash_join::left_join */ std::unique_ptr> left_join( - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + private: + using probing_scheme_type = cuco::linear_probing<1, hasher>; + using cuco_storage_type = cuco::storage<1>; + + /// Hash table type + using hash_table_type = cuco::static_set, + cuco::extent, + cuda::thread_scope_device, + always_not_equal, + probing_scheme_type, + cudf::detail::cuco_allocator, + cuco_storage_type>; + + bool _has_nested_columns; ///< True if nested columns are present in build and probe tables + cudf::null_equality _nulls_equal; ///< Whether to consider nulls as equal + cudf::table_view _build; ///< Input table to build the hash map + std::shared_ptr + _preprocessed_build; ///< Input table preprocssed for row operators + hash_table_type _hash_table; ///< Hash table built on `_build` }; } // namespace cudf::detail diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index ea2f5d4b6ca..5edbb322231 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,7 +60,7 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 }; * @return `true` if the type is supported by `fixed_point` implementation */ template -constexpr inline auto is_supported_representation_type() +CUDF_HOST_DEVICE constexpr inline auto is_supported_representation_type() { return cuda::std::is_same_v || // cuda::std::is_same_v || // @@ -72,6 +72,24 @@ constexpr inline auto is_supported_representation_type() // Helper functions for `fixed_point` type namespace detail { +/** + * @brief Returns the smaller of the given scales + * + * @param a The left-hand side value to compare + * @param b The right-hand side value to compare + * @return The smaller of the given scales + */ +CUDF_HOST_DEVICE constexpr inline scale_type min(scale_type const& a, scale_type const& b) +{ + // TODO This is a temporary workaround because is not self-contained when + // built with NVRTC 11.8. Replace this with cuda::std::min once the underlying issue is resolved. +#ifdef __CUDA_ARCH__ + return scale_type{min(static_cast(a), static_cast(b))}; +#else + return std::min(a, b); +#endif +} + /** * @brief A function for integer exponentiation by squaring. 
* @@ -267,12 +285,12 @@ class fixed_point { * @return The `fixed_point` number in base 10 (aka human readable format) */ template >* = nullptr> - explicit constexpr operator U() const + CUDF_HOST_DEVICE explicit constexpr operator U() const { // Cast to the larger of the two types (of U and Rep) before converting to Rep because in // certain cases casting to U before shifting will result in integer overflow (i.e. if U = // int32_t, Rep = int64_t and _value > 2 billion) - auto const value = std::common_type_t(_value); + auto const value = cuda::std::common_type_t(_value); return static_cast(detail::shift(value, scale_type{-_scale})); } @@ -669,7 +687,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator+(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const sum = lhs.rescaled(scale)._value + rhs.rescaled(scale)._value; #if defined(__CUDACC_DEBUG__) @@ -687,7 +705,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator-(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const diff = lhs.rescaled(scale)._value - rhs.rescaled(scale)._value; #if defined(__CUDACC_DEBUG__) @@ -735,7 +753,7 @@ template CUDF_HOST_DEVICE inline bool operator==(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value == rhs.rescaled(scale)._value; } @@ -744,7 +762,7 @@ template CUDF_HOST_DEVICE inline bool operator!=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value != rhs.rescaled(scale)._value; } @@ -753,7 +771,7 @@ template CUDF_HOST_DEVICE inline bool operator<=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value <= rhs.rescaled(scale)._value; } @@ -762,7 +780,7 @@ template CUDF_HOST_DEVICE inline bool operator>=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value >= rhs.rescaled(scale)._value; } @@ -771,7 +789,7 @@ template CUDF_HOST_DEVICE inline bool operator<(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value < rhs.rescaled(scale)._value; } @@ -780,7 +798,7 @@ template CUDF_HOST_DEVICE inline bool operator>(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value; } @@ -789,7 +807,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator%(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const remainder = lhs.rescaled(scale)._value % rhs.rescaled(scale)._value; return fixed_point{scaled_integer{remainder, scale}}; } diff --git a/cpp/include/cudf/hashing.hpp 
b/cpp/include/cudf/hashing.hpp index 307a52cd242..88034b4f804 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -166,6 +166,26 @@ std::unique_ptr sha512( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Computes the XXHash_32 hash value of each row in the given table + * + * This function computes the hash of each column using the `seed` for the first column + * and the resulting hash as a seed for the next column and so on. + * The result is a uint32 value for each row. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr xxhash_32( + table_view const& input, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Computes the XXHash_64 hash value of each row in the given table * diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index 7cb80081a95..f796ff4526e 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -61,6 +61,11 @@ std::unique_ptr sha512(table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); +std::unique_ptr xxhash_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view, + rmm::device_async_resource_ref mr); + std::unique_ptr xxhash_64(table_view const& input, uint64_t seed, rmm::cuda_stream_view, diff --git a/cpp/include/cudf/hashing/detail/xxhash_32.cuh b/cpp/include/cudf/hashing/detail/xxhash_32.cuh new file mode 100644 index 00000000000..bb6e7f18fbc --- /dev/null +++ b/cpp/include/cudf/hashing/detail/xxhash_32.cuh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace cudf::hashing::detail { + +template +struct XXHash_32 { + using result_type = std::uint32_t; + + CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {} + + __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); } + + __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes, + std::uint64_t size) const + { + return this->_impl.compute_hash(bytes, size); + } + + private: + template + __device__ constexpr result_type compute(T const& key) const + { + return this->compute_bytes(reinterpret_cast(&key), sizeof(T)); + } + + cuco::xxhash_32 _impl; +}; + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()(bool const& key) const +{ + return this->compute(static_cast(key)); +} + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()(float const& key) const +{ + return this->compute(normalize_nans_and_zeros(key)); +} + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()( + double const& key) const +{ + return this->compute(normalize_nans_and_zeros(key)); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(cudf::string_view const& key) const +{ + return this->compute_bytes(reinterpret_cast(key.data()), + key.size_bytes()); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(numeric::decimal32 const& key) const +{ + return this->compute(key.value()); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(numeric::decimal64 const& key) const +{ + return this->compute(key.value()); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(numeric::decimal128 const& key) const +{ + return this->compute(key.value()); +} + +template <> +XXHash_32::result_type __device__ inline XXHash_32::operator()( + cudf::list_view const& key) const +{ + CUDF_UNREACHABLE("List column hashing is not supported"); +} + +template <> +XXHash_32::result_type + __device__ inline XXHash_32::operator()(cudf::struct_view const& key) const +{ + CUDF_UNREACHABLE("Direct hashing of struct_view is not supported"); +} + +} // namespace cudf::hashing::detail diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index afefd04d4fa..cc63565eee1 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -34,13 +34,6 @@ namespace CUDF_EXPORT cudf { -/** - * @brief Enum to indicate whether the distinct join table has nested columns or not - * - * @ingroup column_join - */ -enum class has_nested : bool { YES, NO }; - // forward declaration namespace hashing::detail { @@ -61,7 +54,6 @@ class hash_join; /** * @brief Forward declaration for our distinct hash join */ -template class distinct_hash_join; } // namespace detail @@ -469,20 +461,19 @@ class hash_join { rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; private: - const std::unique_ptr _impl; + std::unique_ptr _impl; }; /** * @brief Distinct hash join that builds hash table in creation and probes results in subsequent * `*_join` member functions * + * This class enables the distinct hash join scheme that builds hash table once, and probes as many + * times as needed (possibly in parallel). + * * @note Behavior is undefined if the build table contains duplicates. * @note All NaNs are considered as equal - * - * @tparam HasNested Flag indicating whether there are nested columns in build/probe table */ -// TODO: `HasNested` to be removed via dispatching -template class distinct_hash_join { public: distinct_hash_join() = delete; @@ -496,15 +487,10 @@ class distinct_hash_join { * @brief Constructs a distinct hash join object for subsequent probe calls * * @param build The build table that contains distinct elements - * @param probe The probe table, from which the keys are probed - * @param has_nulls Flag to indicate if there exists any nulls in the `build` table or - * any `probe` table that will be used later for join * @param compare_nulls Controls whether null join-key values should match or not * @param stream CUDA stream used for device memory operations and kernel launches */ distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - nullable_join has_nulls = nullable_join::YES, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = cudf::get_default_stream()); @@ -512,16 +498,18 @@ class distinct_hash_join { * @brief Returns the row indices that can be used to construct the result of performing * an inner join between two tables. @see cudf::inner_join(). * + * @param probe The probe table, from which the keys are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned indices' device memory. * - * @return A pair of columns [`build_indices`, `probe_indices`] that can be used to + * @return A pair of columns [`probe_indices`, `build_indices`] that can be used to * construct the result of performing an inner join between two tables * with `build` and `probe` as the join keys. */ [[nodiscard]] std::pair>, std::unique_ptr>> - inner_join(rmm::cuda_stream_view stream = cudf::get_default_stream(), + inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** @@ -532,19 +520,22 @@ class distinct_hash_join { * the row index of the matched row from the build table if there is a match. Otherwise, contains * `JoinNoneValue`. * + * @param probe The probe table, from which the keys are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. 
+ * * @return A `build_indices` column that can be used to construct the result of * performing a left join between two tables with `build` and `probe` as the join * keys. */ [[nodiscard]] std::unique_ptr> left_join( + cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; private: - using impl_type = typename cudf::detail::distinct_hash_join; ///< Implementation type + using impl_type = cudf::detail::distinct_hash_join; ///< Implementation type std::unique_ptr _impl; ///< Distinct hash join implementation }; diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 6351a84e38f..c1dd79ef14f 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,7 +53,7 @@ namespace CUDF_EXPORT cudf { * @return The `cudf::type_id` corresponding to the specified type */ template -inline constexpr type_id type_to_id() +CUDF_HOST_DEVICE inline constexpr type_id type_to_id() { return type_id::EMPTY; }; diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 0e31a0b6cf5..2f255e7a07c 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,8 @@ #include #include +#include + namespace cudf { namespace binops { namespace compiled { @@ -51,7 +53,7 @@ struct type_casted_accessor { { if constexpr (column_device_view::has_element_accessor()) { auto const element = col.element(is_scalar ? 
0 : i); - if constexpr (std::is_convertible_v) { + if constexpr (cuda::std::is_convertible_v) { return static_cast(element); } else if constexpr (is_fixed_point() && cuda::std::is_floating_point_v) { return convert_fixed_to_floating(element); @@ -75,7 +77,7 @@ struct typed_casted_writer { FromType val) const { if constexpr (mutable_column_device_view::has_element_accessor() and - std::is_constructible_v) { + cuda::std::is_constructible_v) { col.element(i) = static_cast(val); } else if constexpr (is_fixed_point()) { auto const scale = numeric::scale_type{col.type().scale()}; @@ -109,18 +111,18 @@ struct ops_wrapper { template __device__ void operator()(size_type i) { - if constexpr (std::is_invocable_v) { + if constexpr (cuda::std::is_invocable_v) { TypeCommon x = type_dispatcher(lhs.type(), type_casted_accessor{}, i, lhs, is_lhs_scalar); TypeCommon y = type_dispatcher(rhs.type(), type_casted_accessor{}, i, rhs, is_rhs_scalar); auto result = [&]() { - if constexpr (std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v) { + if constexpr (cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v) { bool output_valid = false; auto result = BinaryOperator{}.template operator()( x, @@ -134,7 +136,7 @@ struct ops_wrapper { return BinaryOperator{}.template operator()(x, y); } // To suppress nvcc warning - return std::invoke_result_t{}; + return cuda::std::invoke_result_t{}; }(); if constexpr (is_bool_result()) out.element(i) = result; @@ -161,16 +163,16 @@ struct ops2_wrapper { __device__ void operator()(size_type i) { if constexpr (!has_common_type_v and - std::is_invocable_v) { + cuda::std::is_invocable_v) { TypeLhs x = lhs.element(is_lhs_scalar ? 0 : i); TypeRhs y = rhs.element(is_rhs_scalar ? 0 : i); auto result = [&]() { - if constexpr (std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v) { + if constexpr (cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v) { bool output_valid = false; auto result = BinaryOperator{}.template operator()( x, @@ -184,7 +186,7 @@ struct ops2_wrapper { return BinaryOperator{}.template operator()(x, y); } // To suppress nvcc warning - return std::invoke_result_t{}; + return cuda::std::invoke_result_t{}; }(); if constexpr (is_bool_result()) out.element(i) = result; diff --git a/cpp/src/hash/xxhash_32.cu b/cpp/src/hash/xxhash_32.cu new file mode 100644 index 00000000000..40503f7f911 --- /dev/null +++ b/cpp/src/hash/xxhash_32.cu @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace cudf { +namespace hashing { +namespace detail { + +namespace { + +/** + * @brief Computes the hash value of a row in the given table. + * + * @tparam Nullate A cudf::nullate type describing whether to check for nulls. + */ +template +class device_row_hasher { + public: + device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed) + : _check_nulls(nulls), _table(t), _seed(seed) + { + } + + __device__ auto operator()(size_type row_index) const noexcept + { + return cudf::detail::accumulate( + _table.begin(), + _table.end(), + _seed, + [row_index, nulls = _check_nulls] __device__(auto hash, auto column) { + return cudf::type_dispatcher( + column.type(), element_hasher_adapter{}, column, row_index, nulls, hash); + }); + } + + /** + * @brief Computes the hash value of an element in the given column. + */ + class element_hasher_adapter { + public: + template ())> + __device__ hash_value_type operator()(column_device_view const& col, + size_type const row_index, + Nullate const _check_nulls, + hash_value_type const _seed) const noexcept + { + if (_check_nulls && col.is_null(row_index)) { + return cuda::std::numeric_limits::max(); + } + auto const hasher = XXHash_32{_seed}; + return hasher(col.element(row_index)); + } + + template ())> + __device__ hash_value_type operator()(column_device_view const&, + size_type const, + Nullate const, + hash_value_type const) const noexcept + { + CUDF_UNREACHABLE("Unsupported type for XXHash_32"); + } + }; + + Nullate const _check_nulls; + table_device_view const _table; + hash_value_type const _seed; +}; + +} // namespace + +std::unique_ptr xxhash_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto output = make_numeric_column(data_type(type_to_id()), + input.num_rows(), + mask_state::UNALLOCATED, + stream, + mr); + + // Return early if there's nothing to hash + if (input.num_columns() == 0 || input.num_rows() == 0) { return output; } + + bool const nullable = has_nulls(input); + auto const input_view = table_device_view::create(input, stream); + auto output_view = output->mutable_view(); + + // Compute the hash value for each row + thrust::tabulate(rmm::exec_policy(stream), + output_view.begin(), + output_view.end(), + device_row_hasher(nullable, *input_view, seed)); + + return output; +} + +} // namespace detail + +std::unique_ptr xxhash_32(table_view const& input, + uint32_t seed, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::xxhash_32(input, seed, stream, mr); +} + +} // namespace hashing +} // namespace cudf diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 3a4e315348c..ac81dd421fa 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -416,11 +416,11 @@ std::optional is_compression_disabled(compression_type compression, memo_map_lock.unlock(); if (reason.has_value()) { - CUDF_LOG_INFO("nvCOMP is disabled for {} compression; reason: {}", + CUDF_LOG_INFO("nvCOMP is disabled for %s compression; reason: %s", compression_type_name(compression), reason.value()); } else { - CUDF_LOG_INFO("nvCOMP is enabled for {} compression", compression_type_name(compression)); + CUDF_LOG_INFO("nvCOMP is enabled for %s compression", compression_type_name(compression)); } return reason; @@ -445,11 +445,11 @@ std::optional is_decompression_disabled(compression_type compressio memo_map_lock.unlock(); if (reason.has_value()) { - CUDF_LOG_INFO("nvCOMP is disabled for {} decompression; reason: {}", + CUDF_LOG_INFO("nvCOMP is disabled for %s decompression; reason: %s", compression_type_name(compression), reason.value()); } else { - CUDF_LOG_INFO("nvCOMP is enabled for {} decompression", compression_type_name(compression)); + CUDF_LOG_INFO("nvCOMP is enabled for %s decompression", compression_type_name(compression)); } return reason; diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index e05353ee822..0d51526d925 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -771,7 +771,7 @@ table_with_metadata read_csv(cudf::io::datasource* source, if (!reader_opts.is_enabled_mangle_dupe_cols()) { for (auto& col_name : column_names) { if (++col_names_counts[col_name] > 1) { - CUDF_LOG_WARN("Multiple columns with name {}; only the first appearance is parsed", + CUDF_LOG_WARN("Multiple columns with name %s; only the first appearance is parsed", col_name); auto const idx = &col_name - column_names.data(); diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 7facc6497ed..469f933f918 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 1572b7246c0..1f84d1f81dc 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,6 +132,177 @@ struct orcdec_state_s { } vals; }; +/** + * @brief Manage caching of the first run of TIMESTAMP's DATA stream for a row group. + * + * This class is used to address a special case, where the first run of the DATA stream spans two + * adjacent row groups and its length is greater than the maximum length allowed to be consumed. + * This limit is imposed by the decoder when processing the SECONDARY stream. This class shall be + * instantiated in the shared memory, and be used to cache the DATA stream with a decoded data type + * of `int64_t`. 
As an optimization, the actual cache is implemented in the cache_helper class as a + * local variable and does not reside in the shared memory. + */ +class run_cache_manager { + private: + enum class status : uint8_t { + DISABLED, ///< Run cache manager is disabled. No caching will be performed. If the special case + ///< happens, the run cache manager will be set to this status after the cache read + ///< is completed. This status also applies when the special case does not happen. + CAN_WRITE_TO_CACHE, ///< Run cache manager is ready for write. If the special case happens, the + ///< run cache manager will be set to this status. + CAN_READ_FROM_CACHE, ///< Run cache manager is ready for read. If the special case happens, the + ///< run cache manager will be set to this status after the cache write is + ///< completed. + }; + + public: + /** + * @brief Initialize the run cache manager. + * + * @param[in] s ORC decoder state. + */ + __device__ void initialize(orcdec_state_s* s) + { + _status = (s->top.data.index.run_pos[CI_DATA2] > 0 and s->chunk.type_kind == TIMESTAMP) + ? status::CAN_WRITE_TO_CACHE + : status::DISABLED; + _reusable_length = 0; + _run_length = 0; + } + + private: + status _status; ///< The status of the run cache manager. + uint32_t + _reusable_length; ///< The number of data to be cached and reused later. For example, if a run + ///< has a length of 512 but the maximum length allowed to be consumed is + ///< capped at 162, then 350 (512-162) data will be cached. + uint32_t _run_length; ///< The length of the run, 512 in the above example. + friend class cache_helper; +}; + +/** + * @brief Helper class to help run_cache_manager cache the first run of TIMESTAMP's DATA stream for + * a row group. + * + * The run_cache_manager is intended to be stored in the shared memory, whereas the actual cache is + * in the local storage (as an optimization). If a function is to use run_cache_manager, both the + * manager and the cache objects need to be passed. This class is introduced to simplify the + * function call, so that only a single cache_helper object needs to be passed. To that end, public + * methods originally belonging to run_cache_manager have been moved to this class. + */ +class cache_helper { + public: + /** + * @brief Constructor. + * + * @param[in] run_cache_manager_inst An instance of run_cache_manager. + */ + __device__ explicit cache_helper(run_cache_manager& run_cache_manager_inst) + : _manager(run_cache_manager_inst) + { + } + + /** + * @brief Set the reusable length object. + * + * @param[in] run_length The length of the first run (spanning two adjacent row groups) of the + * DATA stream. + * @param[in] max_length The maximum length allowed to be consumed. This limit is imposed + * by the decoder when processing the SECONDARY stream. + */ + __device__ void set_reusable_length(uint32_t run_length, uint32_t max_length) + { + if (_manager._status == run_cache_manager::status::CAN_WRITE_TO_CACHE) { + _manager._run_length = run_length; + _manager._reusable_length = + (_manager._run_length > max_length) ? (_manager._run_length - max_length) : 0; + } + } + + /** + * @brief Adjust the maximum length allowed to be consumed when the length of the first run is + * greater than it. + * + * @param[in] max_length The maximum length allowed to be consumed for the DATA stream. + * @return A new maximum length. 
+ */ + [[nodiscard]] __device__ uint32_t adjust_max_length(uint32_t max_length) + { + auto new_max_length{max_length}; + if (_manager._status == run_cache_manager::status::CAN_READ_FROM_CACHE) { + new_max_length -= _manager._reusable_length; + } + return new_max_length; + } + + /** + * @brief Copy the excess data from the intermediate buffer for the DATA stream to the cache. + * + * @param[in] src Intermediate buffer for the DATA stream. + */ + __device__ void write_to_cache(int64_t* src) + { + if (_manager._status != run_cache_manager::status::CAN_WRITE_TO_CACHE) { return; } + + auto const tid = threadIdx.x; + + __syncthreads(); + + // All threads in the block always take a uniform code path for the following branches. + // _reusable_length ranges between [0, 512]. + if (_manager._reusable_length > 0) { + auto const length_to_skip = _manager._run_length - _manager._reusable_length; + if (tid < _manager._reusable_length) { + auto const src_idx = tid + length_to_skip; + _storage = src[src_idx]; + } + if (tid == 0) { _manager._status = run_cache_manager::status::CAN_READ_FROM_CACHE; } + } else { + if (tid == 0) { _manager._status = run_cache_manager::status::DISABLED; } + } + + __syncthreads(); + } + + /** + * @brief Copy the cached data to the intermediate buffer for the DATA stream. + * + * @param[in,out] dst Intermediate buffer for the DATA stream. + * @param[in,out] rle Run length decoder state object. + */ + __device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle) + { + if (_manager._status != run_cache_manager::status::CAN_READ_FROM_CACHE) { return; } + + auto const tid = threadIdx.x; + + // First, shift the data up + auto const dst_idx = tid + _manager._reusable_length; + auto const v = (dst_idx < rle->num_vals + _manager._reusable_length) ? dst[tid] : 0; + __syncthreads(); + + if (dst_idx < rle->num_vals + _manager._reusable_length) { dst[dst_idx] = v; } + __syncthreads(); + + // Second, insert the cached data + if (tid < _manager._reusable_length) { dst[tid] = _storage; } + __syncthreads(); + + if (tid == 0) { + // Disable the run cache manager, since cache write-and-read happens at most once per row + // group. + _manager._status = run_cache_manager::status::DISABLED; + rle->num_vals += _manager._reusable_length; + } + + __syncthreads(); + } + + private: + run_cache_manager& _manager; ///< An instance of run_cache_manager. + int64_t _storage; ///< Per-thread cache storage. +}; + /** * @brief Initializes byte stream, modifying length and start position to keep the read pointer * 8-byte aligned. @@ -631,6 +802,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { * @param[in] maxvals maximum number of values to decode * @param[in] t thread id * @param[in] has_buffered_values If true, means there are already buffered values + * @param[in] cache_helper_inst If non-null, the run cache manager will be used to manage + * caching of the first run of the DATA stream. 
* * @return number of values decoded */ @@ -640,9 +813,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, T* vals, uint32_t maxvals, int t, - bool has_buffered_values = false) + bool has_buffered_values = false, + cache_helper* cache_helper_inst = nullptr) { if (t == 0) { + if (cache_helper_inst != nullptr) { maxvals = cache_helper_inst->adjust_max_length(maxvals); } uint32_t maxpos = min(bs->len, bs->pos + (bytestream_buffer_size - 8u)); uint32_t lastpos = bs->pos; auto numvals = 0; @@ -685,6 +860,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, l += deltapos; } } + + if (cache_helper_inst != nullptr) { cache_helper_inst->set_reusable_length(n, maxvals); } + if ((numvals != 0) and (numvals + n > maxvals)) break; // case where there are buffered values and can't consume a whole chunk // from decoded values, so skip adding any more to buffer, work on buffered values and then @@ -866,6 +1044,17 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, __syncwarp(); } __syncthreads(); + // Currently run_cache_manager is only designed to fix the TIMESTAMP's DATA stream bug where the + // data type is int64_t. + if constexpr (cuda::std::is_same_v) { + if (cache_helper_inst != nullptr) { + // Run cache is read from during the 2nd iteration of the top-level while loop in + // gpuDecodeOrcColumnData(). + cache_helper_inst->read_from_cache(vals, rle); + // Run cache is written to during the 1st iteration of the loop. + cache_helper_inst->write_to_cache(vals); + } + } return rle->num_vals; } @@ -1401,6 +1590,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) // Struct doesn't have any data in itself, so skip bool const is_valid = s->chunk.type_kind != STRUCT; size_t const max_num_rows = s->chunk.column_num_rows; + __shared__ run_cache_manager run_cache_manager_inst; + cache_helper cache_helper_inst(run_cache_manager_inst); if (t == 0 and is_valid) { // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { @@ -1443,6 +1634,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); + + run_cache_manager_inst.initialize(s); } __syncthreads(); @@ -1602,7 +1795,13 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (is_rlev1(s->chunk.encoding_kind)) { numvals = Integer_RLEv1(bs, &s->u.rlev1, s->vals.i64, numvals, t); } else { - numvals = Integer_RLEv2(bs, &s->u.rlev2, s->vals.i64, numvals, t); + numvals = Integer_RLEv2(bs, + &s->u.rlev2, + s->vals.i64, + numvals, + t, + false /**has_buffered_values */, + &cache_helper_inst); } if (s->chunk.type_kind == DECIMAL) { // If we're using an index, we may have to drop values from the initial run diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index b5f9b894c46..0d40a1f7b1b 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 6b1a20701f9..77924ac0f35 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -719,7 +719,7 @@ std::vector construct_parquet_schema_tree( // all others default: CUDF_LOG_WARN( - "Unsupported page encoding requested: {}; the requested encoding will be ignored", + "Unsupported page encoding requested: %d; the requested encoding will be ignored", static_cast(col_meta.get_encoding())); return; } diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index dfa5d46cf48..975206646c6 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -44,7 +44,7 @@ class file_sink : public data_sink { if (cufile_integration::is_kvikio_enabled()) { cufile_integration::set_up_kvikio(); _kvikio_file = kvikio::FileHandle(filepath, "w"); - CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", + CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode %s.", _kvikio_file.is_compat_mode_preferred() ? "on" : "off"); } else { _cufile_out = detail::make_cufile_output(filepath); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 38dedcc2627..87b3c6facdf 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,7 +55,7 @@ class file_source : public datasource { if (cufile_integration::is_kvikio_enabled()) { cufile_integration::set_up_kvikio(); _kvikio_file = kvikio::FileHandle(filepath); - CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", + CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode %s.", _kvikio_file.is_compat_mode_preferred() ? "on" : "off"); } else { _cufile_in = detail::make_cufile_input(filepath); @@ -230,7 +230,7 @@ class memory_mapped_source : public file_source { { if (_map_addr != nullptr) { auto const result = munmap(_map_addr, _map_size); - if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); } + if (result != 0) { CUDF_LOG_WARN("munmap failed with %d", result); } _map_addr = nullptr; } } diff --git a/cpp/src/io/utilities/getenv_or.hpp b/cpp/src/io/utilities/getenv_or.hpp index b9613428418..acfd2221797 100644 --- a/cpp/src/io/utilities/getenv_or.hpp +++ b/cpp/src/io/utilities/getenv_or.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -32,10 +32,17 @@ T getenv_or(std::string_view env_var_name, T default_val) { auto const env_val = std::getenv(env_var_name.data()); if (env_val != nullptr) { - CUDF_LOG_INFO("Environment variable {} read as {}", env_var_name, env_val); + CUDF_LOG_INFO("Environment variable %.*s read as %s", + static_cast(env_var_name.length()), + env_var_name.data(), + env_val); } else { - CUDF_LOG_INFO( - "Environment variable {} is not set, using default value {}", env_var_name, default_val); + std::stringstream ss; + ss << default_val; + CUDF_LOG_INFO("Environment variable %.*s is not set, using default value %s", + static_cast(env_var_name.length()), + env_var_name.data(), + ss.str()); } if (env_val == nullptr) { return default_val; } diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index ce4d2067b82..d1a01ee76e4 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,28 +47,19 @@ namespace cudf { namespace detail { namespace { -template -auto prepare_device_equal( - std::shared_ptr build, - std::shared_ptr probe, - bool has_nulls, - cudf::null_equality compare_nulls) -{ - auto const two_table_equal = - cudf::experimental::row::equality::two_table_comparator(probe, build); - return comparator_adapter{two_table_equal.equal_to( - nullate::DYNAMIC{has_nulls}, compare_nulls)}; -} +bool constexpr has_nulls = true; ///< Always has nulls /** * @brief Device functor to create a pair of {hash_value, row_index} for a given row. - * - * @tparam Hasher The type of internal hasher to compute row hash. 
*/ -template +template class build_keys_fn { + using hasher = + cudf::experimental::row::hash::device_row_hasher; + public: - CUDF_HOST_DEVICE build_keys_fn(Hasher const& hash) : _hash{hash} {} + CUDF_HOST_DEVICE constexpr build_keys_fn(hasher const& hash) : _hash{hash} {} __device__ __forceinline__ auto operator()(size_type i) const noexcept { @@ -76,7 +67,7 @@ class build_keys_fn { } private: - Hasher _hash; + hasher _hash; }; /** @@ -92,26 +83,19 @@ struct output_fn { }; } // namespace -template -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - bool has_nulls, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) - : _has_nulls{has_nulls}, +distinct_hash_join::distinct_hash_join(cudf::table_view const& build, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) + : _has_nested_columns{cudf::has_nested_columns(build)}, _nulls_equal{compare_nulls}, _build{build}, - _probe{probe}, _preprocessed_build{ cudf::experimental::row::equality::preprocessed_table::create(_build, stream)}, - _preprocessed_probe{ - cudf::experimental::row::equality::preprocessed_table::create(_probe, stream)}, _hash_table{build.num_rows(), CUCO_DESIRED_LOAD_FACTOR, cuco::empty_key{cuco::pair{std::numeric_limits::max(), rhs_index_type{JoinNoneValue}}}, - prepare_device_equal( - _preprocessed_build, _preprocessed_probe, has_nulls, compare_nulls), + always_not_equal{}, {}, cuco::thread_scope_device, cuco_storage_type{}, @@ -124,10 +108,10 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, if (this->_build.num_rows() == 0) { return; } auto const row_hasher = experimental::row::hash::row_hasher{this->_preprocessed_build}; - auto const d_hasher = row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); + auto const d_hasher = row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); - auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_hasher}); + auto const iter = + cudf::detail::make_counting_transform_iterator(0, build_keys_fn{d_hasher}); size_type const build_table_num_rows{build.num_rows()}; if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(this->_build))) { @@ -146,15 +130,15 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, } } -template std::pair>, std::unique_ptr>> -distinct_hash_join::inner_join(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const +distinct_hash_join::inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"distinct_hash_join::inner_join"}; - size_type const probe_table_num_rows{this->_probe.num_rows()}; + size_type const probe_table_num_rows{probe.num_rows()}; // If output size is zero, return immediately if (probe_table_num_rows == 0) { @@ -162,25 +146,62 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, std::make_unique>(0, stream, mr)); } + auto preprocessed_probe = + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( + preprocessed_probe, _preprocessed_build); + auto build_indices = std::make_unique>(probe_table_num_rows, stream, mr); auto probe_indices = std::make_unique>(probe_table_num_rows, stream, mr); - auto const probe_row_hasher = - cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe}; - auto const d_probe_hasher = 
probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); - auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_probe_hasher}); + auto const probe_row_hasher = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; + auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); + auto const iter = cudf::detail::make_counting_transform_iterator( + 0, build_keys_fn{d_probe_hasher}); auto found_indices = rmm::device_uvector(probe_table_num_rows, stream); auto const found_begin = thrust::make_transform_output_iterator(found_indices.begin(), output_fn{}); - // TODO conditional find for nulls once `cuco::static_set::find_if` is added - // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not equal - // to `JoinNoneValue`, then `idx` has a match in the hash set. - this->_hash_table.find_async(iter, iter + probe_table_num_rows, found_begin, stream.value()); + auto const comparator_helper = [&](auto device_comparator) { + // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not + // equal to `JoinNoneValue`, then `idx` has a match in the hash set. + if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(probe))) { + this->_hash_table.find_async(iter, + iter + probe_table_num_rows, + comparator_adapter{device_comparator}, + hasher{}, + found_begin, + stream.value()); + } else { + auto stencil = thrust::counting_iterator{0}; + auto const row_bitmask = + cudf::detail::bitmask_and(probe, stream, cudf::get_current_device_resource_ref()).first; + auto const pred = + cudf::detail::row_is_valid{reinterpret_cast(row_bitmask.data())}; + + this->_hash_table.find_if_async(iter, + iter + probe_table_num_rows, + stencil, + pred, + comparator_adapter{device_comparator}, + hasher{}, + found_begin, + stream.value()); + } + }; + + if (_has_nested_columns) { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } else { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } auto const tuple_iter = cudf::detail::make_counting_transform_iterator( 0, @@ -203,16 +224,17 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, build_indices->resize(actual_size, stream); probe_indices->resize(actual_size, stream); - return {std::move(build_indices), std::move(probe_indices)}; + return {std::move(probe_indices), std::move(build_indices)}; } -template -std::unique_ptr> distinct_hash_join::left_join( - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const +std::unique_ptr> distinct_hash_join::left_join( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"distinct_hash_join::left_join"}; - size_type const probe_table_num_rows{this->_probe.num_rows()}; + size_type const probe_table_num_rows{probe.num_rows()}; // If output size is zero, return empty if (probe_table_num_rows == 0) { @@ -227,80 +249,82 @@ std::unique_ptr> distinct_hash_join::l thrust::fill( rmm::exec_policy_nosync(stream), build_indices->begin(), build_indices->end(), JoinNoneValue); } else { - auto const probe_row_hasher = - cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe}; - auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); - auto const iter 
= cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_probe_hasher}); + auto preprocessed_probe = + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( + preprocessed_probe, _preprocessed_build); + + auto const probe_row_hasher = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; + auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); + auto const iter = cudf::detail::make_counting_transform_iterator( + 0, build_keys_fn{d_probe_hasher}); auto const output_begin = thrust::make_transform_output_iterator(build_indices->begin(), output_fn{}); - // TODO conditional find for nulls once `cuco::static_set::find_if` is added - this->_hash_table.find_async(iter, iter + probe_table_num_rows, output_begin, stream.value()); + auto const comparator_helper = [&](auto device_comparator) { + if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(probe))) { + this->_hash_table.find_async(iter, + iter + probe_table_num_rows, + comparator_adapter{device_comparator}, + hasher{}, + output_begin, + stream.value()); + } else { + auto stencil = thrust::counting_iterator{0}; + auto const row_bitmask = + cudf::detail::bitmask_and(probe, stream, cudf::get_current_device_resource_ref()).first; + auto const pred = + cudf::detail::row_is_valid{reinterpret_cast(row_bitmask.data())}; + + this->_hash_table.find_if_async(iter, + iter + probe_table_num_rows, + stencil, + pred, + comparator_adapter{device_comparator}, + hasher{}, + output_begin, + stream.value()); + } + }; + + if (_has_nested_columns) { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } else { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } } return build_indices; } } // namespace detail -template <> -distinct_hash_join::~distinct_hash_join() = default; - -template <> -distinct_hash_join::~distinct_hash_join() = default; - -template <> -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - nullable_join has_nulls, - null_equality compare_nulls, - rmm::cuda_stream_view stream) - : _impl{std::make_unique( - build, probe, has_nulls == nullable_join::YES, compare_nulls, stream)} -{ -} - -template <> -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - nullable_join has_nulls, - null_equality compare_nulls, - rmm::cuda_stream_view stream) - : _impl{std::make_unique( - build, probe, has_nulls == nullable_join::YES, compare_nulls, stream)} -{ -} +distinct_hash_join::~distinct_hash_join() = default; -template <> -std::pair>, - std::unique_ptr>> -distinct_hash_join::inner_join(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const +distinct_hash_join::distinct_hash_join(cudf::table_view const& build, + null_equality compare_nulls, + rmm::cuda_stream_view stream) + : _impl{std::make_unique(build, compare_nulls, stream)} { - return _impl->inner_join(stream, mr); } -template <> std::pair>, std::unique_ptr>> -distinct_hash_join::inner_join(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const -{ - return _impl->inner_join(stream, mr); -} - -template <> -std::unique_ptr> -distinct_hash_join::left_join(rmm::cuda_stream_view stream, - 
rmm::device_async_resource_ref mr) const +distinct_hash_join::inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - return _impl->left_join(stream, mr); + return _impl->inner_join(probe, stream, mr); } -template <> -std::unique_ptr> distinct_hash_join::left_join( - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const +std::unique_ptr> distinct_hash_join::left_join( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - return _impl->left_join(stream, mr); + return _impl->left_join(probe, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh index 4f75908fe72..37c5698f654 100644 --- a/cpp/src/join/join_common_utils.cuh +++ b/cpp/src/join/join_common_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index b04e9961e01..b5063931485 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -64,10 +65,10 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str, if (str_length == 0) return tgt_length; if (tgt_length == 0) return str_length; - auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin(); - auto itr = str_length < tgt_length ? d_tgt.begin() : d_str.begin(); - // .first is min and .second is max - auto const [n, m] = std::minmax(str_length, tgt_length); + auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin(); + auto itr = str_length < tgt_length ? d_tgt.begin() : d_str.begin(); + auto const n = cuda::std::min(str_length, tgt_length); + auto const m = cuda::std::max(str_length, tgt_length); // setup compute buffer pointers auto v0 = buffer; auto v1 = v0 + n + 1; @@ -81,7 +82,7 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str, auto sub_cost = v0[j] + (*itr != *itr_tgt); auto del_cost = v0[j + 1] + 1; auto ins_cost = v1[j] + 1; - v1[j + 1] = std::min(std::min(sub_cost, del_cost), ins_cost); + v1[j + 1] = cuda::std::min(cuda::std::min(sub_cost, del_cost), ins_cost); } thrust::swap(v0, v1); } @@ -170,7 +171,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str ? 
d_targets.element(0) : d_targets.element(idx); // just need 2 integers for each character of the shorter string - return (std::min(d_str.length(), d_tgt.length()) + 1) * 2; + return (cuda::std::min(d_str.length(), d_tgt.length()) + 1) * 2; }); // get the total size of the temporary compute buffer @@ -241,7 +242,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con if (d_str1.empty() || d_str2.empty()) { return; } // the temp size needed is 2 integers per character of the shorter string d_offsets[idx - ((row + 1) * (row + 2)) / 2] = - (std::min(d_str1.length(), d_str2.length()) + 1) * 2; + (cuda::std::min(d_str1.length(), d_str2.length()) + 1) * 2; }); // get the total size for the compute buffer diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 2de94a4eb59..247440212d0 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -243,7 +244,7 @@ CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_string } } auto const char_count = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); } + if (lane_idx == 0) { d_counts[str_idx] = cuda::std::max(1, char_count - width + 1); } } /** diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 9a44d9477ab..9ce17c36b1f 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,14 +40,13 @@ #include #include +#include #include #include #include #include #include -#include - namespace nvtext { namespace detail { namespace { @@ -156,7 +155,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, // initialize the output -- only needed for wider strings auto d_output = d_results + (str_idx * param_count); for (auto i = lane_idx; i < param_count; i += tile_size) { - d_output[i] = std::numeric_limits::max(); + d_output[i] = cuda::std::numeric_limits::max(); } } } @@ -226,7 +225,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, ? section_size : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); - auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); + auto const init = size_bytes == 0 ? 
0 : cuda::std::numeric_limits::max(); auto const lane_idx = block.thread_rank(); auto const d_output = d_results + (str_idx * parameter_a.size()); @@ -235,7 +234,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, // constants used in the permutation calculations constexpr uint64_t mersenne_prime = (1UL << 61) - 1; - constexpr hash_value_type hash_max = std::numeric_limits::max(); + constexpr hash_value_type hash_max = cuda::std::numeric_limits::max(); // found to be an efficient shared memory size for both hash types __shared__ hash_value_type block_values[block_size * params_per_thread]; diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 943bcbe9b3a..b041ce3ce0a 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -196,7 +197,7 @@ struct sub_offset_fn { { // keep delimiter search within this sub-block auto const end = - d_input_chars + std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset); + d_input_chars + cuda::std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset); // starting point of this sub-block auto itr = d_input_chars + first_offset + ((idx + 1) * LS_SUB_BLOCK_SIZE); while ((itr < end) && diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index a3bed45e4bd..7a39199011e 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -134,8 +135,8 @@ extract_code_points_from_utf8(unsigned char const* strings, constexpr uint8_t max_utf8_blocks_for_char = 4; uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0}; - for (int i = 0; i < std::min(static_cast(max_utf8_blocks_for_char), - total_bytes - start_byte_for_thread); + for (int i = 0; i < cuda::std::min(static_cast(max_utf8_blocks_for_char), + total_bytes - start_byte_for_thread); ++i) { utf8_blocks[i] = strings[start_byte_for_thread + i]; } diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index dd1e8ddb027..19f144dd158 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,6 +27,8 @@ #include #include +#include +#include #include #include #include @@ -87,7 +89,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi // Deal with the start_word_indices array if (char_for_thread < num_code_points) { - uint32_t val_to_write = std::numeric_limits::max(); + uint32_t val_to_write = cuda::std::numeric_limits::max(); if ((code_points[char_for_thread] != SPACE_CODE_POINT) && (char_for_thread > 0) && (code_points[char_for_thread - 1] == SPACE_CODE_POINT)) { val_to_write = char_for_thread; @@ -95,7 +97,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi start_word_indices[char_for_thread] = val_to_write; // Deal with the end_word_indices_array - val_to_write = std::numeric_limits::max(); + val_to_write = cuda::std::numeric_limits::max(); if ((code_points[char_for_thread] != SPACE_CODE_POINT) && (char_for_thread + 1 < num_code_points) && (code_points[char_for_thread + 1] == SPACE_CODE_POINT)) { @@ -103,7 +105,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi } end_word_indices[char_for_thread] = val_to_write; - token_ids[char_for_thread] = std::numeric_limits::max(); + token_ids[char_for_thread] = cuda::std::numeric_limits::max(); tokens_per_word[char_for_thread] = 0; } } @@ -214,7 +216,7 @@ struct mark_special_tokens { __device__ void operator()(size_t idx) const { uint32_t const start_index = start_word_indices[idx]; - if ((start_index == std::numeric_limits::max()) || + if ((start_index == cuda::std::numeric_limits::max()) || ((start_index + MIN_ST_WIDTH + 2) > num_code_points)) return; if (code_points[start_index] != '[') return; @@ -225,12 +227,12 @@ struct mark_special_tokens { uint32_t const end_index = [&] { auto const begin = start_word_indices + start_pos; auto const width = - std::min(static_cast(MAX_ST_WIDTH + 1), (num_code_points - start_pos)); + cuda::std::min(static_cast(MAX_ST_WIDTH + 1), (num_code_points - start_pos)); auto const end = begin + width; // checking the next start-word is more reliable than arbitrarily searching for ']' // in case the text is split across string rows auto const iter = thrust::find_if(thrust::seq, begin + 1, end, [](auto swi) { - return swi != std::numeric_limits::max(); + return swi != cuda::std::numeric_limits::max(); }); return iter == end ? start_index : static_cast(iter - start_word_indices); }(); @@ -254,11 +256,11 @@ struct mark_special_tokens { thrust::fill(thrust::seq, start_word_indices + start_index + 1, // keep the first one start_word_indices + end_index + 1, - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); thrust::fill(thrust::seq, end_word_indices + start_index, end_word_indices + end_index + 1, - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); // reset the new end-word index end_word_indices[end_pos] = end_pos + 1; @@ -382,7 +384,7 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points, // We need to clean up the global array. This case is very uncommon. // Only 0.016% of words cannot be resolved to a token from the squad dev set. 
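
The replacements in this file (and in the text utilities above) swap host-only `std::` facilities for their libcu++ counterparts, which are callable from device code. A standalone CUDA sketch of the pattern, assuming only `<cuda/std/limits>` from libcu++ (the kernel and sizes are illustrative, not cudf code):

```cpp
// Sketch: cuda::std::numeric_limits works inside __device__ code, which is why the
// kernels above stop using std::numeric_limits for their sentinel values.
#include <cuda/std/limits>
#include <cuda_runtime.h>

#include <cstdint>

__global__ void fill_sentinel(uint32_t* out, int n)
{
  auto const idx = static_cast<int>(blockIdx.x * blockDim.x + threadIdx.x);
  if (idx < n) { out[idx] = cuda::std::numeric_limits<uint32_t>::max(); }
}

int main()
{
  int const n      = 256;
  uint32_t* d_out  = nullptr;
  cudaMalloc(&d_out, n * sizeof(uint32_t));
  fill_sentinel<<<1, n>>>(d_out, n);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}
```
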
for (uint32_t i = 1; i < num_values_tokenized; ++i) { - token_ids[token_start + i] = std::numeric_limits::max(); + token_ids[token_start + i] = cuda::std::numeric_limits::max(); } num_values_tokenized = 0; } @@ -423,7 +425,10 @@ uvector_pair wordpiece_tokenizer::tokenize(cudf::strings_column_view const& inpu } struct copy_if_fn { // inline lambda not allowed in private or protected member function - __device__ bool operator()(uint32_t cp) { return cp != std::numeric_limits::max(); } + __device__ bool operator()(uint32_t cp) + { + return cp != cuda::std::numeric_limits::max(); + } }; struct tranform_fn { // just converting uint8 value to uint32 @@ -487,7 +492,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre auto itr_end = thrust::remove(rmm::exec_policy(stream), device_word_indices.begin(), device_word_indices.end(), - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); // The number of tokens selected will be double the number of words since we // select from both the start and end index arrays. @@ -523,7 +528,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre // token so this will always have enough memory to store the contiguous tokens. uint32_t* contiguous_token_ids = device_code_points; auto const copy_size = // thrust::copy_if limited to copying int-max values - std::min(device_token_ids.size(), static_cast(std::numeric_limits::max())); + cuda::std::min(device_token_ids.size(), + static_cast(cuda::std::numeric_limits::max())); auto ids_itr = device_token_ids.begin(); auto const ids_end = device_token_ids.end(); while (ids_itr != ids_end) { diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 4196523d211..73c4567d3a4 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,7 +46,7 @@ class fixed_pinned_pool_memory_resource { pool_size_{rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT)}, pool_{new host_pooled_mr(upstream_mr_, pool_size_, pool_size_)} { - CUDF_LOG_INFO("Pinned pool size = {}", pool_size_); + CUDF_LOG_INFO("Pinned pool size = %zu", pool_size_); // Allocate full size from the pinned pool to figure out the beginning and end address pool_begin_ = pool_->allocate_async(pool_size_, stream_); diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index b0f2d8c0637..80364885980 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -129,7 +129,8 @@ class rmm_cuda_stream_pool : public cuda_stream_pool { std::vector get_streams(std::size_t count) override { if (count > STREAM_POOL_SIZE) { - CUDF_LOG_WARN("get_streams called with count ({}) > pool size ({})", count, STREAM_POOL_SIZE); + CUDF_LOG_WARN( + "get_streams called with count (%zu) > pool size (%zu)", count, STREAM_POOL_SIZE); } auto streams = std::vector(); for (uint32_t i = 0; i < count; i++) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e5c29314203..344979e1288 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -192,6 +192,7 @@ ConfigureTest( hashing/sha256_test.cpp hashing/sha384_test.cpp hashing/sha512_test.cpp + hashing/xxhash_32_test.cpp hashing/xxhash_64_test.cpp ) diff --git a/cpp/tests/hashing/xxhash_32_test.cpp b/cpp/tests/hashing/xxhash_32_test.cpp new file mode 100644 index 00000000000..9e3c66b0d0b --- /dev/null +++ b/cpp/tests/hashing/xxhash_32_test.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +class XXHash_32_Test : public cudf::test::BaseFixture {}; + +TEST_F(XXHash_32_Test, TestInteger) +{ + auto col1 = cudf::test::fixed_width_column_wrapper{{0, 42, 825}}; + auto constexpr seed = 0u; + auto const output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed); + + // Expected results were generated with the reference implementation: + // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h + auto expected = + cudf::test::fixed_width_column_wrapper({148298089u, 1161967057u, 1066694813u}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); +} + +TEST_F(XXHash_32_Test, TestDouble) +{ + auto col1 = cudf::test::fixed_width_column_wrapper{{-8., 25., 90.}}; + auto constexpr seed = 42u; + + auto const output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed); + + // Expected results were generated with the reference implementation: + // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h + auto expected = + cudf::test::fixed_width_column_wrapper({2276435783u, 3120212431u, 3454197470u}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); +} + +TEST_F(XXHash_32_Test, StringType) +{ + auto col1 = cudf::test::strings_column_wrapper({"I", "am", "AI"}); + auto constexpr seed = 825u; + + auto output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed); + + // Expected results were generated with the reference implementation: + // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h + auto expected = + cudf::test::fixed_width_column_wrapper({320624298u, 1612654309u, 1409499009u}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); +} diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 9070efa38fe..e1ec8cda3ac 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -53,7 +53,7 @@ struct DistinctJoinTest : public cudf::test::BaseFixture { cudf::table_view const& expected_table, cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK) { - auto const& [build_join_indices, probe_join_indices] = result; + auto const& [probe_join_indices, build_join_indices] = result; auto build_indices_span = cudf::device_span{*build_join_indices}; auto probe_indices_span = cudf::device_span{*probe_join_indices}; @@ -89,10 +89,9 @@ TEST_F(DistinctJoinTest, IntegerInnerJoin) auto build_table = cudf::table_view{{build->view()}}; auto probe_table = cudf::table_view{{probe->view()}}; - auto distinct_join = cudf::distinct_hash_join{ - build_table, probe_table, cudf::nullable_join::NO}; + auto distinct_join = cudf::distinct_hash_join{build_table}; - auto result = distinct_join.inner_join(); + auto result = distinct_join.inner_join(probe_table); auto constexpr gold_size = size / 2; auto gold = cudf::sequence(gold_size, init, cudf::numeric_scalar{2}); @@ -120,8 +119,8 @@ TEST_F(DistinctJoinTest, InnerJoinNoNulls) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); column_wrapper col_gold_0{{1, 2}}; strcol_wrapper col_gold_1({"s0", "s0"}); @@ -162,8 +161,8 @@ TEST_F(DistinctJoinTest, InnerJoinWithNulls) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); column_wrapper col_gold_0{{3, 2}}; strcol_wrapper col_gold_1({"s1", "s0"}, {true, true}); @@ -229,8 +228,8 @@ TEST_F(DistinctJoinTest, InnerJoinWithStructsAndNulls) Table probe(std::move(cols0)); Table build(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); column_wrapper col_gold_0{{3, 2}}; strcol_wrapper col_gold_1({"s1", "s0"}, {true, true}); @@ -284,8 +283,8 @@ TEST_F(DistinctJoinTest, EmptyBuildTableInnerJoin) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); this->compare_to_reference(build.view(), probe.view(), result, build.view()); } @@ -307,9 +306,9 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -332,8 +331,8 @@ TEST_F(DistinctJoinTest, EmptyProbeTableInnerJoin) Table 
build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); this->compare_to_reference(build.view(), probe.view(), result, probe.view()); } @@ -355,9 +354,9 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -391,9 +390,9 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) cols_gold.push_back(col_gold_3.release()); Table gold(std::move(cols_gold)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -416,9 +415,9 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) Table probe(std::move(cols0)); Table build(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true}); @@ -461,9 +460,9 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) Table probe(std::move(cols0)); Table build(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; auto col0_gold_names_col = strcol_wrapper{ "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Ãœberwald"}; diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index 58396115a54..b5d20325b75 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. 
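
The test updates above exercise the refactored interface: the hash table is now built from the build table alone, each join call takes the probe table, and `inner_join` returns the pair ordered as {probe indices, build indices}. A hedged usage sketch of that pattern (header placement and the stream/memory-resource defaults are assumed; column setup omitted):

```cpp
// Hedged sketch of the refactored cudf::distinct_hash_join usage shown in the tests
// above: construct once from the build table, then probe it.
#include <cudf/join.hpp>  // assumed include; declared with the other join APIs
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <rmm/device_uvector.hpp>

#include <memory>
#include <utility>

std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
          std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
inner_gather_maps(cudf::table_view const& build, cudf::table_view const& probe)
{
  // Build table is expected to hold distinct keys; nulls compare equal here.
  cudf::distinct_hash_join joiner{build, cudf::null_equality::EQUAL};
  // Per this change, the result pair is {probe indices, build indices}.
  return joiner.inner_join(probe);
}
```
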
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,7 +55,7 @@ TEST_F(LoggerTest, DefaultLevel) cudf::default_logger().warn("warn"); cudf::default_logger().error("error"); cudf::default_logger().critical("critical"); - ASSERT_EQ(this->sink_content(), "info\nwarn\nerror\ncritical\n"); + ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n"); } TEST_F(LoggerTest, CustomLevel) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 5024747227e..222b698a78d 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -63,11 +63,11 @@ keyword arguments, cuDF is not able to provide GPU acceleration and `cudf.pandas` will fall back to the CPU. The most accurate way to assess which functions run on the GPU is to try -running the code while using the `cudf.pandas` profiling features. The -profiler will indicate which functions ran on GPU / CPU. To improve -performance, try to use only functionality that can run entirely on GPU. -This helps reduce the number of memory transfers needed to fallback to -CPU. +running the code while using the `cudf.pandas` [profiling +features](cudf-pandas-profiling). The profiler will indicate which functions +ran on GPU / CPU. To improve performance, try to use only functionality that +can run entirely on GPU. This helps reduce the number of memory transfers +needed to fallback to CPU. ## How can I improve performance of my workflow with `cudf.pandas`? diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index 089f283e25d..fed63c2dd0f 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -75,6 +75,7 @@ with Pool(4) as pool: ... ``` +(cudf-pandas-profiling)= ## Profiling `cudf.pandas` `cudf.pandas` will attempt to use the GPU whenever possible and fall diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index 53af52eff07..5e544e92a77 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,12 +62,13 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f * @param filePath Full path of the input Parquet file to read. 
*/ public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, File filePath) { - handle = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - + long[] handles = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + filePath.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -100,12 +101,41 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, HostMemoryBuffer buffer, long offset, long len) { - handle = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, - buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; + if (handle == 0) { + throw new IllegalStateException("Cannot create native chunked Parquet reader object."); + } + multiHostBufferSourceHandle = handles[1]; + } + /** + * Construct the reader instance from a read limit and data in host memory buffers. + * + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param passReadLimit Limit on the amount of memory used for reading and decompressing data or + * 0 if there is no limit + * @param opts The options for Parquet reading. + * @param buffers Array of buffers containing the file data. The buffers are logically + * concatenated to construct the file being read. + */ + public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, + ParquetOptions opts, HostMemoryBuffer... buffers) { + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -181,6 +211,10 @@ public void close() { DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); dataSourceHandle = 0; } + if (multiHostBufferSourceHandle != 0) { + destroyMultiHostBufferSource(multiHostBufferSourceHandle); + multiHostBufferSourceHandle = 0; + } } @@ -196,6 +230,8 @@ public void close() { private long dataSourceHandle = 0; + private long multiHostBufferSourceHandle = 0; + /** * Create a native chunked Parquet reader object on heap and return its memory address. * @@ -206,13 +242,12 @@ public void close() { * @param filterColumnNames Name of the columns to read, or an empty array if we want to read all. * @param binaryToString Whether to convert the corresponding column to String if it is binary. 
* @param filePath Full path of the file to read, or given as null if reading from a buffer. - * @param bufferAddrs The address of a buffer to read from, or 0 if we are not using that buffer. - * @param length The length of the buffer to read from. + * @param bufferAddrsSizes The address and size pairs of buffers to read from, or null if we are not using buffers. * @param timeUnit Return type of time unit for timestamps. */ - private static native long create(long chunkSizeByteLimit, long passReadLimit, - String[] filterColumnNames, boolean[] binaryToString, - String filePath, long bufferAddrs, long length, int timeUnit); + private static native long[] create(long chunkSizeByteLimit, long passReadLimit, + String[] filterColumnNames, boolean[] binaryToString, + String filePath, long[] bufferAddrsSizes, int timeUnit); private static native long createWithDataSource(long chunkedSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); @@ -222,4 +257,6 @@ private static native long createWithDataSource(long chunkedSizeByteLimit, private static native long[] readChunk(long handle); private static native void close(long handle); + + private static native void destroyMultiHostBufferSource(long handle); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b01ce31b1f3..298f2cff6f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -313,12 +313,11 @@ private static native long readAndInferJSON(long address, long length, * all of them * @param binaryToString whether to convert this column to String if binary * @param filePath the path of the file to read, or null if no path should be read. - * @param address the address of the buffer to read from or 0 if we should not. - * @param length the length of the buffer to read from. + * @param addrsAndSizes the address and size pairs for every buffer or null for no buffers. * @param timeUnit return type of TimeStamp in units */ private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, - long address, long length, int timeUnit) throws CudfException; + long[] addrsAndSizes, int timeUnit) throws CudfException; private static native long[] readParquetFromDataSource(String[] filterColumnNames, boolean[] binaryToString, int timeUnit, @@ -1357,7 +1356,7 @@ public static Table readParquet(File path) { */ public static Table readParquet(ParquetOptions opts, File path) { return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); + path.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId())); } /** @@ -1402,6 +1401,14 @@ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, } } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffer raw parquet formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU. 
+ */ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, long len) { return readParquet(opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } @@ -1422,10 +1429,35 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); + } + + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffers Buffers containing the Parquet data. The buffers are logically concatenated + * in order to construct the file being read. + * @return the data parsed as a table on the GPU. + */ + public static Table readParquet(ParquetOptions opts, HostMemoryBuffer... buffers) { + assert buffers.length > 0; + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param ds custom datasource to provide the Parquet file data + * @return the data parsed as a table on the GPU. + */ public static Table readParquet(ParquetOptions opts, DataSource ds) { long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); try { diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9ff43feeac6..bd1714aa476 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -156,8 +156,9 @@ add_library( src/ScalarJni.cpp src/TableJni.cpp src/aggregation128_utils.cu - src/maps_column_view.cu src/check_nvcomp_output_sizes.cu + src/maps_column_view.cu + src/multi_host_buffer_source.cpp ) # Disable NVTX if necessary diff --git a/java/src/main/native/include/multi_host_buffer_source.hpp b/java/src/main/native/include/multi_host_buffer_source.hpp new file mode 100644 index 00000000000..2aedb2321e4 --- /dev/null +++ b/java/src/main/native/include/multi_host_buffer_source.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
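
For reference, a hedged sketch of calling the new varargs overload added above from Java; the class and method names are illustrative and the buffers are assumed to already hold consecutive slices of a single Parquet file:

```java
// Hedged sketch (not part of the patch): the buffers are logically concatenated in
// argument order to reconstruct the Parquet file, per the Javadoc added above.
import ai.rapids.cudf.HostMemoryBuffer;
import ai.rapids.cudf.ParquetOptions;
import ai.rapids.cudf.Table;

public class MultiBufferParquetExample {
  public static Table readSplitFile(HostMemoryBuffer first, HostMemoryBuffer second) {
    return Table.readParquet(ParquetOptions.DEFAULT, first, second);
  }
}
```
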
+ */ +#pragma once + +#include "jni_utils.hpp" + +#include + +#include + +namespace cudf { +namespace jni { + +/** + * @brief A custom datasource providing data from an array of host memory buffers. + */ +class multi_host_buffer_source : public cudf::io::datasource { + std::vector addrs_; + std::vector offsets_; + + size_t locate_offset_index(size_t offset); + + public: + explicit multi_host_buffer_source(native_jlongArray const& addrs_sizes); + std::unique_ptr host_read(size_t offset, size_t size) override; + size_t host_read(size_t offset, size_t size, uint8_t* dst) override; + bool supports_device_read() const override { return true; } + bool is_device_read_preferred(size_t size) const override { return true; } + std::unique_ptr device_read(size_t offset, + size_t size, + rmm::cuda_stream_view stream) override; + size_t device_read(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override; + std::future device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) override; + size_t size() const override { return offsets_.back(); } +}; + +} // namespace jni +} // namespace cudf diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp index cf04a87262f..4967e0b2b04 100644 --- a/java/src/main/native/src/ChunkedReaderJni.cpp +++ b/java/src/main/native/src/ChunkedReaderJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #include "cudf_jni_apis.hpp" #include "jni_utils.hpp" +#include "multi_host_buffer_source.hpp" #include #include @@ -36,7 +37,7 @@ extern "C" { // This function should take all the parameters that `Table.readParquet` takes, // plus one more parameter `long chunkSizeByteLimit`. 
-JNIEXPORT jlong JNICALL +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jclass, jlong chunk_read_limit, @@ -44,27 +45,26 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inp_file_path, - jlong buffer, - jlong buffer_length, + jlongArray addrs_sizes, jint unit) { - JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0); + JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", nullptr); bool read_buffer = true; - if (buffer == 0) { - JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", 0); + if (addrs_sizes == nullptr) { + JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", nullptr); read_buffer = false; } else if (inp_file_path != nullptr) { - JNI_THROW_NEW( - env, cudf::jni::ILLEGAL_ARG_CLASS, "Cannot pass in both a buffer and an inp_file_path", 0); - } else if (buffer_length <= 0) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0); + JNI_THROW_NEW(env, + cudf::jni::ILLEGAL_ARG_CLASS, + "Cannot pass in both buffers and an inp_file_path", + nullptr); } try { cudf::jni::auto_set_device(env); cudf::jni::native_jstring filename(env, inp_file_path); if (!read_buffer && filename.is_empty()) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", 0); + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", nullptr); } cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); @@ -75,9 +75,15 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); (void)n_col_binary_read; - auto const source = read_buffer ? 
cudf::io::source_info(reinterpret_cast(buffer), - static_cast(buffer_length)) - : cudf::io::source_info(filename.get()); + cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_sizes); + std::unique_ptr multi_buffer_source; + cudf::io::source_info source; + if (read_buffer) { + multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes)); + source = cudf::io::source_info(multi_buffer_source.get()); + } else { + source = cudf::io::source_info(filename.get()); + } auto opts_builder = cudf::io::parquet_reader_options::builder(source); if (n_filter_col_names.size() > 0) { @@ -86,13 +92,18 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, auto const read_opts = opts_builder.convert_strings_to_categories(false) .timestamp_type(cudf::data_type(static_cast(unit))) .build(); - - return reinterpret_cast( + n_addrs_sizes.cancel(); + n_col_binary_read.cancel(); + auto reader_handle = reinterpret_cast( new cudf::io::chunked_parquet_reader(static_cast(chunk_read_limit), static_cast(pass_read_limit), read_opts)); + cudf::jni::native_jlongArray result(env, 2); + result[0] = reader_handle; + result[1] = cudf::jni::release_as_jlong(multi_buffer_source); + return result.get_jArray(); } - CATCH_STD(env, 0); + CATCH_STD(env, nullptr); } JNIEXPORT jlong JNICALL @@ -177,6 +188,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* en CATCH_STD(env, ); } +JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_destroyMultiHostBufferSource( + JNIEnv* env, jclass, jlong handle) +{ + JNI_NULL_CHECK(env, handle, "handle is null", ); + + try { + delete reinterpret_cast(handle); + } + CATCH_STD(env, ); +} + // // Chunked ORC reader JNI // diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 1f8b1ea207d..a6c7ae9ba18 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include "jni_compiled_expr.hpp" #include "jni_utils.hpp" #include "jni_writer_data_sink.hpp" +#include "multi_host_buffer_source.hpp" #include #include @@ -2071,20 +2072,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inputfilepath, - jlong buffer, - jlong buffer_length, + jlongArray addrs_and_sizes, jint unit) { JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0); bool read_buffer = true; - if (buffer == 0) { + if (addrs_and_sizes == nullptr) { JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL); read_buffer = false; } else if (inputfilepath != NULL) { JNI_THROW_NEW( env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL); - } else if (buffer_length <= 0) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL); } try { @@ -2096,10 +2094,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env, cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); - - auto source = read_buffer ? 
cudf::io::source_info(reinterpret_cast(buffer), - static_cast(buffer_length)) - : cudf::io::source_info(filename.get()); + cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_and_sizes); + std::unique_ptr multi_buffer_source; + cudf::io::source_info source; + if (read_buffer) { + multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes)); + source = cudf::io::source_info(multi_buffer_source.get()); + } else { + source = cudf::io::source_info(filename.get()); + } auto builder = cudf::io::parquet_reader_options::builder(source); if (n_filter_col_names.size() > 0) { @@ -2110,7 +2113,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env, builder.convert_strings_to_categories(false) .timestamp_type(cudf::data_type(static_cast(unit))) .build(); - return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl); + auto tbl = cudf::io::read_parquet(opts).tbl; + n_col_binary_read.cancel(); + n_addrs_sizes.cancel(); + return convert_table_for_return(env, tbl); } CATCH_STD(env, NULL); } @@ -2901,16 +2907,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftDistinctJoinGatherMap j_right_keys, compare_nulls_equal, [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) { - auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - if (cudf::has_nested_columns(right)) { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - return hash.left_join(); - } else { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - return hash.left_join(); - } + cudf::distinct_hash_join hash(right, nulleq); + return hash.left_join(left); }); } @@ -3119,22 +3117,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerDistinctJoinGatherMa j_right_keys, compare_nulls_equal, [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) { - auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - std::pair>, - std::unique_ptr>> - maps; - if (cudf::has_nested_columns(right)) { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - maps = hash.inner_join(); - } else { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - maps = hash.inner_join(); - } - // Unique join returns {right map, left map} but all the other joins - // return {left map, right map}. Swap here to make it consistent. - return std::make_pair(std::move(maps.second), std::move(maps.first)); + cudf::distinct_hash_join hash(right, nulleq); + return hash.inner_join(left); }); } diff --git a/java/src/main/native/src/multi_host_buffer_source.cpp b/java/src/main/native/src/multi_host_buffer_source.cpp new file mode 100644 index 00000000000..c577fc680ba --- /dev/null +++ b/java/src/main/native/src/multi_host_buffer_source.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
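
The datasource implementation that follows resolves a file offset to its owning buffer through an exclusive prefix sum of the buffer sizes. A standalone sketch of that lookup idiom, using plain std:: containers and no JNI types:

```cpp
// Sketch of the offset lookup used below: offsets holds {0, s0, s0+s1, ..., total},
// so the owning buffer of a valid offset is the predecessor of upper_bound(offset).
#include <algorithm>
#include <cstddef>
#include <vector>

std::size_t locate_buffer_index(std::vector<std::size_t> const& offsets, std::size_t offset)
{
  auto const it = std::upper_bound(offsets.begin(), offsets.end(), offset);
  return static_cast<std::size_t>(it - offsets.begin()) - 1;
}
```
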
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "multi_host_buffer_source.hpp" + +#include +#include +#include +#include + +namespace cudf { +namespace jni { + +multi_host_buffer_source::multi_host_buffer_source(native_jlongArray const& addrs_sizes) +{ + if (addrs_sizes.size() % 2 != 0) { + throw std::logic_error("addrs_sizes length not a multiple of 2"); + } + auto count = addrs_sizes.size() / 2; + addrs_.reserve(count); + offsets_.reserve(count + 1); + size_t total_size = 0; + for (int i = 0; i < addrs_sizes.size(); i += 2) { + addrs_.push_back(reinterpret_cast(addrs_sizes[i])); + offsets_.push_back(total_size); + total_size += addrs_sizes[i + 1]; + } + offsets_.push_back(total_size); +} + +size_t multi_host_buffer_source::locate_offset_index(size_t offset) +{ + if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); } + auto start = offsets_.begin(); + auto it = std::upper_bound(start, offsets_.end(), offset); + return (it - start) - 1; +} + +std::unique_ptr multi_host_buffer_source::host_read(size_t offset, + size_t size) +{ + if (size == 0) { return 0; } + if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); } + auto const end_offset = offset + size; + if (end_offset > offsets_.back()) { throw std::runtime_error("read past end of file"); } + auto buffer_index = locate_offset_index(offset); + auto next_offset = offsets_[buffer_index + 1]; + if (end_offset <= next_offset) { + // read range hits only a single buffer, so return a zero-copy view of the data + auto src = addrs_[buffer_index] + offset - offsets_[buffer_index]; + return std::make_unique(src, size); + } + auto buf = std::vector(size); + auto bytes_read = host_read(offset, size, buf.data()); + if (bytes_read != size) { + std::stringstream ss; + ss << "Expected host read of " << size << " found " << bytes_read; + throw std::logic_error(ss.str()); + } + return std::make_unique>>(std::move(buf)); +} + +size_t multi_host_buffer_source::host_read(size_t offset, size_t size, uint8_t* dst) +{ + if (size == 0) { return 0; } + if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); } + if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); } + auto buffer_index = locate_offset_index(offset); + auto bytes_left = size; + while (bytes_left > 0) { + auto next_offset = offsets_[buffer_index + 1]; + auto buffer_left = next_offset - offset; + auto buffer_offset = offset - offsets_[buffer_index]; + auto src = addrs_[buffer_index] + buffer_offset; + auto copy_size = std::min(buffer_left, bytes_left); + std::memcpy(dst, src, copy_size); + offset += copy_size; + dst += copy_size; + bytes_left -= copy_size; + ++buffer_index; + } + return size; +} + +std::unique_ptr multi_host_buffer_source::device_read( + size_t offset, size_t size, rmm::cuda_stream_view stream) +{ + rmm::device_buffer buf(size, stream); + auto dst = static_cast(buf.data()); + auto bytes_read = device_read(offset, size, dst, stream); + if (bytes_read != size) { + std::stringstream ss; + ss << "Expected device read of " << size << " found " << bytes_read; + throw std::logic_error(ss.str()); + } + return std::make_unique>(std::move(buf)); +} + +size_t multi_host_buffer_source::device_read(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) +{ + if (size == 0) { return 0; } + if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad 
offset"); } + if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); } + auto buffer_index = locate_offset_index(offset); + auto bytes_left = size; + while (bytes_left > 0) { + auto next_offset = offsets_[buffer_index + 1]; + auto buffer_left = next_offset - offset; + auto buffer_offset = offset - offsets_[buffer_index]; + auto src = addrs_[buffer_index] + buffer_offset; + auto copy_size = std::min(buffer_left, bytes_left); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, copy_size, cudaMemcpyHostToDevice, stream.value())); + offset += copy_size; + dst += copy_size; + bytes_left -= copy_size; + ++buffer_index; + } + return size; +} + +std::future multi_host_buffer_source::device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) +{ + std::promise p; + p.set_value(device_read(offset, size, dst, stream)); + return p.get_future(); +} + +} // namespace jni +} // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index c7fcb1756b6..7eb32892bad 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,8 +47,11 @@ import java.math.BigInteger; import java.math.RoundingMode; import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.SeekableByteChannel; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.StandardOpenOption; import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; @@ -1714,6 +1717,42 @@ void testChunkedReadParquet() { } } + @Test + void testChunkedReadParquetHostBuffers() throws Exception { + long size = TEST_PARQUET_FILE_CHUNKED_READ.length(); + java.nio.file.Path path = TEST_PARQUET_FILE_CHUNKED_READ.toPath(); + try (HostMemoryBuffer buf1 = HostMemoryBuffer.allocate(size / 2); + HostMemoryBuffer buf2 = HostMemoryBuffer.allocate(size - buf1.getLength())) { + try (SeekableByteChannel channel = Files.newByteChannel(path, StandardOpenOption.READ)) { + ByteBuffer bb1 = buf1.asByteBuffer(); + while (bb1.hasRemaining()) { + if (channel.read(bb1) == -1) { + throw new EOFException("error reading first buffer"); + } + } + ByteBuffer bb2 = buf2.asByteBuffer(); + while (bb2.hasRemaining()) { + if (channel.read(bb2) == -1) { + throw new EOFException("error reading second buffer"); + } + } + } + ParquetOptions opts = ParquetOptions.DEFAULT; + try (ParquetChunkedReader reader = new ParquetChunkedReader(240000, 0, opts, buf1, buf2)) { + int numChunks = 0; + long totalRows = 0; + while(reader.hasNext()) { + ++numChunks; + try(Table chunk = reader.readChunk()) { + totalRows += chunk.getRowCount(); + } + } + assertEquals(2, numChunks); + assertEquals(40000, totalRows); + } + } + } + @Test void testChunkedReadParquetFromDataSource() throws IOException { try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ); diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 40bd50acf16..fd6d0257940 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -260,26 +260,3 @@ cdef class DeviceScalar: self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ (cdtype_id) ] - - 
-def as_device_scalar(val, dtype=None): - if isinstance(val, (cudf.Scalar, DeviceScalar)): - if dtype == val.dtype or dtype is None: - if isinstance(val, DeviceScalar): - return val - else: - return val.device_value - else: - raise TypeError("Can't update dtype of existing GPU scalar") - else: - return cudf.Scalar(val, dtype=dtype).device_value - - -def _is_null_host_scalar(slr): - if cudf.utils.utils.is_na_like(slr): - return True - elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \ - slr is pd.NaT: - return True - else: - return False diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index b10b8dfe207..d705b4d4c21 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -621,7 +621,7 @@ def ordered(self) -> bool: def __setitem__(self, key, value): if cudf.api.types.is_scalar( value - ) and cudf._lib.scalar._is_null_host_scalar(value): + ) and cudf.utils.utils._is_null_host_scalar(value): to_add_categories = 0 else: if cudf.api.types.is_scalar(value): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 31efe267c96..24b657f1c32 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,7 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.scalar import as_device_scalar from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -71,7 +70,7 @@ min_signed_type, min_unsigned_type, ) -from cudf.utils.utils import _array_ufunc, mask_dtype +from cudf.utils.utils import _array_ufunc, _is_null_host_scalar, mask_dtype if TYPE_CHECKING: import builtins @@ -777,9 +776,7 @@ def fillna( if not self.has_nulls(include_nan=True): return self.copy() elif method is None: - if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar( - fill_value - ): + if is_scalar(fill_value) and _is_null_host_scalar(fill_value): return self.copy() else: fill_value = self._validate_fillna_value(fill_value) @@ -1984,12 +1981,12 @@ def as_column( column = Column.from_pylibcudf( plc.filling.sequence( len(arbitrary), - as_device_scalar( + cudf.Scalar( arbitrary.start, dtype=np.dtype(np.int64) - ).c_value, - as_device_scalar( + ).device_value.c_value, + cudf.Scalar( arbitrary.step, dtype=np.dtype(np.int64) - ).c_value, + ).device_value.c_value, ) ) if cudf.get_option("default_integer_bitwidth") and dtype is None: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 3d9440cdf21..6283e498842 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -236,7 +236,7 @@ def from_sequences( # Build Data, Mask & Offsets for data in arbitrary: - if cudf._lib.scalar._is_null_host_scalar(data): + if cudf.utils.utils._is_null_host_scalar(data): mask_col.append(False) offset_vals.append(offset) else: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 4405e153b0c..8fe5299fcdd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -151,7 +151,7 @@ def __setitem__(self, key: Any, value: Any): cudf.Scalar( value, dtype=self.dtype - if cudf._lib.scalar._is_null_host_scalar(value) + if cudf.utils.utils._is_null_host_scalar(value) else None, ) if is_scalar(value) @@ -789,7 +789,7 @@ def 
_normalize_find_and_replace_input( ) # Scalar case if len(col_to_normalize) == 1: - if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]): + if cudf.utils.utils._is_null_host_scalar(col_to_normalize[0]): return normalized_column.astype(input_column_dtype) if np.isinf(col_to_normalize[0]): return normalized_column diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 417fa99dac0..749ab8e837a 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,9 +1,10 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime import functools +import math from typing import TYPE_CHECKING, cast import numpy as np @@ -263,7 +264,15 @@ def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] def total_seconds(self) -> ColumnBase: - raise NotImplementedError("total_seconds is currently not implemented") + conversion = _unit_to_nanoseconds_conversion[self.time_unit] / 1e9 + # Typecast to decimal128 to avoid floating point precision issues + # https://github.com/rapidsai/cudf/issues/17664 + return ( + (self.astype("int64") * conversion) + .astype(cudf.Decimal128Dtype(38, 9)) + .round(decimals=abs(int(math.log10(conversion)))) + .astype("float64") + ) def ceil(self, freq: str) -> ColumnBase: raise NotImplementedError("ceil is currently not implemented") diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3334b57ce1b..b2121511a14 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -92,7 +92,11 @@ min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api +from cudf.utils.utils import ( + GetAttrGetItemMixin, + _external_only_api, + _is_null_host_scalar, +) if TYPE_CHECKING: from cudf._typing import ColumnLike, Dtype, NotImplementedType @@ -3371,7 +3375,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if isinstance(value, (np.ndarray, cupy.ndarray)): dtype = value.dtype value = value.item() - if libcudf.scalar._is_null_host_scalar(value): + if _is_null_host_scalar(value): dtype = "str" value = as_column( value, diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4137109cc96..6ae524d6346 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import copy @@ -49,7 +49,7 @@ from cudf.utils.utils import GetAttrGetItemMixin if TYPE_CHECKING: - from collections.abc import Generator, Iterable + from collections.abc import Generator, Hashable, Iterable from cudf._typing import ( AggType, @@ -2448,7 +2448,7 @@ def _cov_or_corr(self, func, method_name): # create expanded dataframe consisting all combinations of the # struct columns-pairs to be used in the correlation or covariance # i.e. 
(('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - column_names = self.grouping.values._column_names + column_names = self.grouping._values_column_names num_cols = len(column_names) column_pair_structs = {} @@ -2682,10 +2682,8 @@ def diff(self, periods=1, axis=0): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - - values = self.obj.__class__._from_data( - self.grouping.values._data, self.obj.index - ) + values = self.grouping.values + values.index = self.obj.index return values - self.shift(periods=periods) def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: @@ -2789,9 +2787,8 @@ def fillna( raise ValueError("Method can only be of 'ffill', 'bfill'.") return getattr(self, method, limit)() - values = self.obj.__class__._from_data( - self.grouping.values._data, self.obj.index - ) + values = self.grouping.values + values.index = self.obj.index return values.fillna( value=value, inplace=inplace, axis=axis, limit=limit ) @@ -3543,6 +3540,13 @@ def keys(self): self._key_columns[0], name=self.names[0] ) + @property + def _values_column_names(self) -> list[Hashable]: + # If the key columns are in `obj`, filter them out + return [ + x for x in self._obj._column_names if x not in self._named_columns + ] + @property def values(self) -> cudf.core.frame.Frame: """Return value columns as a frame. @@ -3553,11 +3557,9 @@ def values(self) -> cudf.core.frame.Frame: This is mainly used in transform-like operations. """ - # If the key columns are in `obj`, filter them out - value_column_names = [ - x for x in self._obj._column_names if x not in self._named_columns - ] - value_columns = self._obj._data.select_by_label(value_column_names) + value_columns = self._obj._data.select_by_label( + self._values_column_names + ) return self._obj.__class__._from_data(value_columns) def _handle_callable(self, by): diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index eac5b9d71ae..85be8d21d27 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
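# [Editor's sketch, not part of the diff: this refers back to the
#  TimedeltaColumn.total_seconds() implementation added earlier in this patch.
#  The column is multiplied by the unit's length in seconds and rounded to that
#  unit's sub-second precision; the PR performs the rounding in Decimal128 to
#  sidestep float precision issues, and plain Python floats are used here only
#  to show the arithmetic for nanosecond data:]
import math

conversion = 1 / 1e9                         # seconds per tick for "ns" data
decimals = abs(int(math.log10(conversion)))  # 9 sub-second digits to keep
one_day_ns = 86_400_000_000_000
print(round(one_day_ns * conversion, decimals))  # 86400.0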
 from __future__ import annotations
@@ -842,14 +842,14 @@ def sort_values(
     @_performance_tracking
     def _gather(self, gather_map, nullify=False, check_bounds=True):
         gather_map = cudf.core.column.as_column(gather_map)
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._column.take(gather_map, nullify, check_bounds),
             name=self.name,
         )
     @_performance_tracking
     def _apply_boolean_mask(self, boolean_mask):
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._column.apply_boolean_mask(boolean_mask), name=self.name
         )
@@ -857,7 +857,7 @@ def repeat(self, repeats, axis=None):
         return self._as_int_index().repeat(repeats, axis)
     def _split(self, splits):
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._as_int_index()._split(splits), name=self.name
         )
@@ -1657,7 +1657,7 @@ def _clean_nulls_from_index(self) -> Index:
             if isinstance(self, (DatetimeIndex, TimedeltaIndex))
             else str(cudf.NA)
         )
-        return cudf.Index._from_column(
+        return Index._from_column(
             self._column.astype("str").fillna(fill_value),
             name=self.name,
         )
@@ -2964,13 +2964,13 @@ def median(self, *, skipna: bool = True, axis: int | None = 0):
     def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1):
         return self._column.std(skipna=skipna, ddof=ddof)
-    def total_seconds(self) -> cupy.ndarray:
+    def total_seconds(self) -> Index:
         """
         Return total duration of each element expressed in seconds.
-        This method is currently not implemented.
+        The result is an Index of ``float64`` dtype.
         """
-        return self._column.total_seconds().values
+        return Index._from_column(self._column.total_seconds(), name=self.name)
     def ceil(self, freq: str) -> Self:
         """
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 6854cb02aa5..e9ed74f804b 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+# Copyright (c) 2021-2025, NVIDIA CORPORATION.
 """Base class for Frame types that have an index."""
 from __future__ import annotations
@@ -2836,16 +2836,22 @@ def hash_values(
         Parameters
         ----------
-        method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3'
+        method : {'murmur3', 'xxhash32', 'xxhash64', 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'}, default 'murmur3'
            Hash function to use:
            * murmur3: MurmurHash3 hash function
-            * md5: MD5 hash function
+            * xxhash32: xxHash32 hash function
             * xxhash64: xxHash64 hash function
+            * md5: MD5 hash function
+            * sha1: SHA-1 hash function
+            * sha224: SHA-224 hash function
+            * sha256: SHA-256 hash function
+            * sha384: SHA-384 hash function
+            * sha512: SHA-512 hash function
         seed : int, optional
             Seed value to use for the hash function. This parameter is only
-            supported for 'murmur3' and 'xxhash64'.
+            supported for 'murmur3', 'xxhash32', and 'xxhash64'.
Returns @@ -2900,7 +2906,7 @@ def hash_values( 2 fe061786ea286a515b772d91b0dfcd70 dtype: object """ - seed_hash_methods = {"murmur3", "xxhash64"} + seed_hash_methods = {"murmur3", "xxhash32", "xxhash64"} if seed is None: seed = 0 elif method not in seed_hash_methods: @@ -2914,6 +2920,8 @@ def hash_values( ) if method == "murmur3": plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed) + elif method == "xxhash32": + plc_column = plc.hashing.xxhash_32(plc_table, seed) elif method == "xxhash64": plc_column = plc.hashing.xxhash_64(plc_table, seed) elif method == "md5": diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 80dd0921f9c..7d246960cc9 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -178,13 +178,13 @@ def dtype(self): def is_valid(self): if not self._is_host_value_current: self._device_value_to_host() - return not cudf._lib.scalar._is_null_host_scalar(self._host_value) + return not cudf.utils.utils._is_null_host_scalar(self._host_value) def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): - valid = not cudf._lib.scalar._is_null_host_scalar(value) + valid = not cudf.utils.utils._is_null_host_scalar(value) if isinstance(value, list): if dtype is not None: diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 961e5e11bc0..49c2c8cf387 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -5183,6 +5183,66 @@ def components(self) -> cudf.DataFrame: ca, index=self.series.index ) + def total_seconds(self) -> Series: + """ + Return total duration of each element expressed in seconds. + + This method is available directly on TimedeltaIndex + and on Series containing timedelta values under the ``.dt`` namespace. + + Returns + ------- + Index or Series + When the calling object is a TimedeltaIndex, + the return type is an Index with a float64 dtype. When the calling object + is a Series, the return type is Series of type `float64` whose + index is the same as the original. + + See Also + -------- + datetime.timedelta.total_seconds : Standard library version + of this method. + TimedeltaIndex.components : Return a DataFrame with components of + each Timedelta. 
+ + Examples + -------- + **Series** + + >>> import cudf + >>> import pandas as pd + >>> import numpy as np + >>> s = cudf.Series(pd.to_timedelta(np.arange(5), unit="D")) + >>> s + 0 0 days 00:00:00 + 1 1 days 00:00:00 + 2 2 days 00:00:00 + 3 3 days 00:00:00 + 4 4 days 00:00:00 + dtype: timedelta64[ns] + + >>> s.dt.total_seconds() + 0 0.0 + 1 86400.0 + 2 172800.0 + 3 259200.0 + 4 345600.0 + dtype: float64 + + **TimedeltaIndex** + + >>> idx = cudf.from_pandas(pd.to_timedelta(np.arange(5), unit="D")) + >>> idx + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq=None) + + >>> idx.total_seconds() + Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64') + """ + return self._return_result_like_self( + self.series._column.total_seconds() + ) + @_performance_tracking def _align_indices(series_list, how="outer", allow_non_unique=False): diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc new file mode 100644 index 00000000000..a0ea4fbbfc2 Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc differ diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc new file mode 100644 index 00000000000..8a7969cdbbb Binary files /dev/null and b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc differ diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 11a9b398b50..f3cf8e36a5b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import array as arr import contextlib @@ -1440,6 +1440,7 @@ def test_assign_callable(mapping): "sha256", "sha384", "sha512", + "xxhash32", "xxhash64", ], ) @@ -1447,6 +1448,7 @@ def test_assign_callable(mapping): def test_dataframe_hash_values(nrows, method, seed): warning_expected = seed is not None and method not in { "murmur3", + "xxhash32", "xxhash64", } potential_warning = ( @@ -1472,6 +1474,7 @@ def test_dataframe_hash_values(nrows, method, seed): "sha256": object, "sha384": object, "sha512": object, + "xxhash32": np.uint32, "xxhash64": np.uint64, } assert out.dtype == expected_dtypes[method] @@ -1486,7 +1489,7 @@ def test_dataframe_hash_values(nrows, method, seed): assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) -@pytest.mark.parametrize("method", ["murmur3", "xxhash64"]) +@pytest.mark.parametrize("method", ["murmur3", "xxhash32", "xxhash64"]) def test_dataframe_hash_values_seed(method): gdf = cudf.DataFrame() data = np.arange(10) @@ -1500,6 +1503,34 @@ def test_dataframe_hash_values_seed(method): assert_neq(out_one, out_two) +def test_dataframe_hash_values_xxhash32(): + # xxhash32 has no built-in implementation in Python and we don't want to + # add a testing dependency, so we use regression tests against known good + # values. 
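# [Editor's sketch, not part of the test below: with this PR applied, the new
#  hasher is reached through the regular hash_values() API and, like "murmur3"
#  and "xxhash64", accepts a seed; every row hashes to a uint32 value.]
example = cudf.DataFrame({"a": [1, 2, 3]}).hash_values(method="xxhash32", seed=42)
assert example.dtype == np.uint32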
+ gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) + gdf["b"] = -gdf["a"] + out_a = gdf["a"].hash_values(method="xxhash32", seed=0) + expected_a = cudf.Series( + [3736311059, 2307980487, 2906647130, 746578903, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_a, expected_a) + + out_b = gdf["b"].hash_values(method="xxhash32", seed=42) + expected_b = cudf.Series( + [1076387279, 2261349915, 531498073, 650869264, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_b, expected_b) + + out_df = gdf.hash_values(method="xxhash32", seed=0) + expected_df = cudf.Series( + [1223721700, 2885793241, 1920811472, 1146715602, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_df, expected_df) + + def test_dataframe_hash_values_xxhash64(): # xxhash64 has no built-in implementation in Python and we don't want to # add a testing dependency, so we use regression tests against known good diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index c4b4ef60184..fe143e66407 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import datetime import decimal @@ -1970,3 +1970,25 @@ def test_row_group_alignment(datadir): got = cudf.read_orc(buffer) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "inputfile", + [ + "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc", + "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc", + ], +) +def test_orc_reader_desynced_timestamp(datadir, inputfile): + # Test a special case where the DATA stream (second) in a TIMESTAMP column + # is progressed faster than the SECONDARY stream (nanosecond) at the start of a row + # group. In this case, the "run cache manager" in the decoder kernel is used to + # orchestrate the dual-stream processing. + # For more information, see https://github.com/rapidsai/cudf/issues/17155. + + path = datadir / inputfile + + expect = pd.read_orc(path) + got = cudf.read_orc(path) + + assert_frame_equal(cudf.from_pandas(expect), got) diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index d622ff6b94e..f1da2a060ec 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import datetime import operator @@ -1506,3 +1506,25 @@ def test_tdi_unit(): result = pd_tdi.unit expected = cudf_tdi.unit assert result == expected + + +@pytest.mark.parametrize("data", _TIMEDELTA_DATA) +@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) +def test_timedelta_series_total_seconds(data, dtype): + gsr = cudf.Series(data, dtype=dtype) + psr = gsr.to_pandas() + + expected = psr.dt.total_seconds() + actual = gsr.dt.total_seconds() + assert_eq(expected, actual) + + +@pytest.mark.parametrize("data", _TIMEDELTA_DATA) +@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) +def test_timedelta_index_total_seconds(request, data, dtype): + gi = cudf.Index(data, dtype=dtype) + pi = gi.to_pandas() + + expected = pi.total_seconds() + actual = gi.total_seconds() + assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index ca8f9cac2d0..31a8f4de3b3 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -198,7 +198,7 @@ def to_cudf_compatible_scalar(val, dtype=None): If `val` is None, returns None. 
""" - if cudf._lib.scalar._is_null_host_scalar(val) or isinstance( + if cudf.utils.utils._is_null_host_scalar(val) or isinstance( val, cudf.Scalar ): return val diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c83c1cbe895..0adaaa60654 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -341,6 +341,15 @@ def is_na_like(obj): return obj is None or obj is cudf.NA or obj is cudf.NaT +def _is_null_host_scalar(slr) -> bool: + # slr is NA like or NaT like + return ( + is_na_like(slr) + or (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) + or slr is pd.NaT + ) + + def _warn_no_dask_cudf(fn): @functools.wraps(fn) def wrapper(self): diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index b88b109a975..92f39abe71e 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -91,7 +91,7 @@ def __init__( op = partial(self._reduce, request=req) elif name in {"min", "max"}: op = partial(op, propagate_nans=options) - elif name in {"count", "first", "last"}: + elif name in {"count", "sum", "first", "last"}: pass else: raise NotImplementedError( @@ -180,6 +180,18 @@ def _count(self, column: Column) -> Column: ) ) + def _sum(self, column: Column) -> Column: + if column.obj.size() == 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(0, type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + return self._reduce(column, request=plc.aggregation.sum()) + def _min(self, column: Column, *, propagate_nans: bool) -> Column: if propagate_nans and column.nan_count > 0: return Column( diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 86cb2352dcc..15ad845ea78 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -148,3 +148,9 @@ def test_agg_singleton(op): q = df.select(op(pl.col("a"))) assert_gpu_result_equal(q) + + +def test_sum_empty_zero(): + df = pl.LazyFrame({"a": pl.Series(values=[], dtype=pl.Int32())}) + q = df.select(pl.col("a").sum()) + assert_gpu_result_equal(q) diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 20eb2404b77..863102103ed 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -1,7 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import warnings -from importlib import import_module +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import dask.dataframe as dd from dask import config @@ -9,11 +6,16 @@ import cudf -from . import backends # noqa: F401 +from . import backends, io # noqa: F401 +from ._expr.expr import _patch_dask_expr from ._version import __git_commit__, __version__ # noqa: F401 -from .core import DataFrame, Index, Series, concat, from_cudf +from .core import DataFrame, Index, Series, _deprecated_api, concat, from_cudf -QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED +if not (QUERY_PLANNING_ON := dd._dask_expr_enabled()): + raise ValueError( + "The legacy DataFrame API is not supported in dask_cudf>24.12. 
" + "Please enable query-planning, or downgrade to dask_cudf<=24.12" + ) def read_csv(*args, **kwargs): @@ -36,46 +38,18 @@ def read_parquet(*args, **kwargs): return dd.read_parquet(*args, **kwargs) -def _deprecated_api(old_api, new_api=None, rec=None): - def inner_func(*args, **kwargs): - if new_api: - # Use alternative - msg = f"{old_api} is now deprecated. " - msg += rec or f"Please use {new_api} instead." - warnings.warn(msg, FutureWarning) - new_attr = new_api.split(".") - module = import_module(".".join(new_attr[:-1])) - return getattr(module, new_attr[-1])(*args, **kwargs) - - # No alternative - raise an error - raise NotImplementedError( - f"{old_api} is no longer supported. " + (rec or "") - ) - - return inner_func - - -if QUERY_PLANNING_ON: - from . import io - from ._expr.expr import _patch_dask_expr - - groupby_agg = _deprecated_api("dask_cudf.groupby_agg") - read_text = DataFrame.read_text - _patch_dask_expr() - -else: - from . import io # noqa: F401 - from ._legacy.groupby import groupby_agg # noqa: F401 - from ._legacy.io import read_text # noqa: F401 - - +groupby_agg = _deprecated_api("dask_cudf.groupby_agg") +read_text = DataFrame.read_text to_orc = _deprecated_api( "dask_cudf.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.to_orc", rec="Please use DataFrame.to_orc instead.", ) +_patch_dask_expr() + + __all__ = [ "DataFrame", "Index", diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 5192e6b8171..e8c9a970b7b 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import warnings from functools import cached_property @@ -15,19 +15,11 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.dataframe.dispatch import get_parallel_type from dask.typing import no_default import cudf -_LEGACY_WORKAROUND = ( - "To enable the 'legacy' dask-cudf API, set the " - "global 'dataframe.query-planning' config to " - "`False` before dask is imported. This can also " - "be done by setting an environment variable: " - "`DASK_DATAFRAME__QUERY_PLANNING=False` " -) - - ## ## Custom collection classes ## @@ -103,9 +95,8 @@ def set_index( divisions = None warnings.warn( "Ignoring divisions='quantile'. This option is now " - "deprecated. Please use the legacy API and raise an " - "issue on github if this feature is necessary." - f"\n{_LEGACY_WORKAROUND}", + "deprecated. Please raise an issue on github if this " + "feature is necessary.", FutureWarning, ) @@ -135,9 +126,7 @@ def groupby( if kwargs.pop("as_index") is not True: raise NotImplementedError( - f"{msg} Please reset the index after aggregating, or " - "use the legacy API if `as_index=False` is required.\n" - f"{_LEGACY_WORKAROUND}" + f"{msg} Please reset the index after aggregating." 
) else: warnings.warn(msg, FutureWarning) @@ -153,15 +142,15 @@ def groupby( ) def to_orc(self, *args, **kwargs): - from dask_cudf._legacy.io import to_orc + from dask_cudf.io.orc import to_orc as to_orc_impl - return to_orc(self, *args, **kwargs) + return to_orc_impl(self, *args, **kwargs) @staticmethod def read_text(*args, **kwargs): - from dask_cudf._legacy.io.text import read_text as legacy_read_text + from dask_cudf.io.text import read_text as read_text_impl - return legacy_read_text(*args, **kwargs) + return read_text_impl(*args, **kwargs) def clip(self, lower=None, upper=None, axis=1): if axis not in (None, 1): @@ -197,6 +186,13 @@ class Index(DXIndex, CudfFrameBase): pass # Same as pandas (for now) +# dask.dataframe dispatch +get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) +get_parallel_type.register(cudf.Series, lambda _: Series) +get_parallel_type.register(cudf.BaseIndex, lambda _: Index) + + +# dask_expr dispatch (might go away?) get_collection_type.register(cudf.DataFrame, lambda _: DataFrame) get_collection_type.register(cudf.Series, lambda _: Series) get_collection_type.register(cudf.BaseIndex, lambda _: Index) diff --git a/python/dask_cudf/dask_cudf/_expr/expr.py b/python/dask_cudf/dask_cudf/_expr/expr.py index 8b91e53604c..03d1da0d258 100644 --- a/python/dask_cudf/dask_cudf/_expr/expr.py +++ b/python/dask_cudf/dask_cudf/_expr/expr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import functools import dask_expr._shuffle as _shuffle_module @@ -7,13 +7,13 @@ from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var -from dask.dataframe.core import ( - is_dataframe_like, +from dask.dataframe.dispatch import ( + is_categorical_dtype, make_meta, meta_nonempty, ) -from dask.dataframe.dispatch import is_categorical_dtype from dask.typing import no_default +from dask.utils import is_dataframe_like import cudf diff --git a/python/dask_cudf/dask_cudf/_expr/groupby.py b/python/dask_cudf/dask_cudf/_expr/groupby.py index 0242fac6e72..a5cdd43169b 100644 --- a/python/dask_cudf/dask_cudf/_expr/groupby.py +++ b/python/dask_cudf/dask_cudf/_expr/groupby.py @@ -1,6 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import functools +import numpy as np import pandas as pd from dask_expr._collection import new_collection from dask_expr._groupby import ( @@ -16,11 +17,262 @@ from dask.dataframe.groupby import Aggregation from cudf.core.groupby.groupby import _deprecate_collect +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking ## ## Fused groupby aggregations ## +OPTIMIZED_AGGS = ( + "count", + "mean", + "std", + "var", + "sum", + "min", + "max", + list, + "first", + "last", +) + + +def _make_name(col_name, sep="_"): + """Combine elements of `col_name` into a single string, or no-op if + `col_name` is already a string + """ + if isinstance(col_name, str): + return col_name + return sep.join(name for name in col_name if name != "") + + +@_dask_cudf_performance_tracking +def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): + """Initial partition-level aggregation task. + + This is the first operation to be executed on each input + partition in `groupby_agg`. Depending on `aggs`, four possible + groupby aggregations ("count", "sum", "min", and "max") are + performed. The result is then partitioned (by hashing `gb_cols`) + into a number of distinct dictionary elements. 
The number of + elements in the output dictionary (`split_out`) corresponds to + the number of partitions in the final output of `groupby_agg`. + """ + + # Modify dict for initial (partition-wise) aggregations + _agg_dict = {} + for col, agg_list in aggs.items(): + _agg_dict[col] = set() + for agg in agg_list: + if agg in ("mean", "std", "var"): + _agg_dict[col].add("count") + _agg_dict[col].add("sum") + else: + _agg_dict[col].add(agg) + _agg_dict[col] = list(_agg_dict[col]) + if set(agg_list).intersection({"std", "var"}): + pow2_name = _make_name((col, "pow2"), sep=sep) + df[pow2_name] = df[col].astype("float64").pow(2) + _agg_dict[pow2_name] = ["sum"] + + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + _agg_dict + ) + output_columns = [_make_name(name, sep=sep) for name in gb.columns] + gb.columns = output_columns + # Return with deterministic column ordering + return gb[sorted(output_columns)] + + +@_dask_cudf_performance_tracking +def _tree_node_agg(df, gb_cols, dropna, sort, sep): + """Node in groupby-aggregation reduction tree. + + The input DataFrame (`df`) corresponds to the + concatenated output of one or more `_groupby_partition_agg` + tasks. In this function, "sum", "min" and/or "max" groupby + aggregations will be used to combine the statistics for + duplicate keys. + """ + + agg_dict = {} + for col in df.columns: + if col in gb_cols: + continue + agg = col.split(sep)[-1] + if agg in ("count", "sum"): + agg_dict[col] = ["sum"] + elif agg == "list": + agg_dict[col] = [list] + elif agg in OPTIMIZED_AGGS: + agg_dict[col] = [agg] + else: + raise ValueError(f"Unexpected aggregation: {agg}") + + gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( + agg_dict + ) + + # Don't include the last aggregation in the column names + output_columns = [ + _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) + for name in gb.columns + ] + gb.columns = output_columns + # Return with deterministic column ordering + return gb[sorted(output_columns)] + + +@_dask_cudf_performance_tracking +def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): + """Calculate variance (given count, sum, and sum-squared columns).""" + + # Select count, sum, and sum-squared + n = df[count_name] + x = df[sum_name] + x2 = df[pow2_sum_name] + + # Use sum-squared approach to get variance + var = x2 - x**2 / n + div = n - ddof + div[div < 1] = 1 # Avoid division by 0 + var /= div + + # Set appropriate NaN elements + # (since we avoided 0-division) + var[(n - ddof) == 0] = np.nan + + return var + + +@_dask_cudf_performance_tracking +def _finalize_gb_agg( + gb_in, + gb_cols, + aggs, + columns, + final_columns, + as_index, + dropna, + sort, + sep, + str_cols_out, + aggs_renames, +): + """Final aggregation task. + + This is the final operation on each output partitions + of the `groupby_agg` algorithm. This function must + take care of higher-order aggregations, like "mean", + "std" and "var". We also need to deal with the column + index, the row index, and final sorting behavior. 
+ """ + + gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) + + # Deal with higher-order aggregations + for col in columns: + agg_list = aggs.get(col, []) + agg_set = set(agg_list) + if agg_set.intersection({"mean", "std", "var"}): + count_name = _make_name((col, "count"), sep=sep) + sum_name = _make_name((col, "sum"), sep=sep) + if agg_set.intersection({"std", "var"}): + pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) + var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) + if "var" in agg_list: + name_var = _make_name((col, "var"), sep=sep) + gb[name_var] = var + if "std" in agg_list: + name_std = _make_name((col, "std"), sep=sep) + gb[name_std] = np.sqrt(var) + gb.drop(columns=[pow2_sum_name], inplace=True) + if "mean" in agg_list: + mean_name = _make_name((col, "mean"), sep=sep) + gb[mean_name] = gb[sum_name] / gb[count_name] + if "sum" not in agg_list: + gb.drop(columns=[sum_name], inplace=True) + if "count" not in agg_list: + gb.drop(columns=[count_name], inplace=True) + if list in agg_list: + collect_name = _make_name((col, "list"), sep=sep) + gb[collect_name] = gb[collect_name].list.concat() + + # Ensure sorted keys if `sort=True` + if sort: + gb = gb.sort_values(gb_cols) + + # Set index if necessary + if as_index: + gb.set_index(gb_cols, inplace=True) + + # Unflatten column names + col_array = [] + agg_array = [] + for col in gb.columns: + if col in gb_cols: + col_array.append(col) + agg_array.append("") + else: + name, agg = col.split(sep) + col_array.append(name) + agg_array.append(aggs_renames.get((name, agg), agg)) + if str_cols_out: + gb.columns = col_array + else: + gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) + + return gb[final_columns] + + +@_dask_cudf_performance_tracking +def _redirect_aggs(arg): + """Redirect aggregations to their corresponding name in cuDF""" + redirects = { + sum: "sum", + max: "max", + min: "min", + "collect": list, + "list": list, + } + if isinstance(arg, dict): + new_arg = dict() + for col in arg: + if isinstance(arg[col], list): + new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] + elif isinstance(arg[col], dict): + new_arg[col] = { + k: redirects.get(v, v) for k, v in arg[col].items() + } + else: + new_arg[col] = redirects.get(arg[col], arg[col]) + return new_arg + if isinstance(arg, list): + return [redirects.get(agg, agg) for agg in arg] + return redirects.get(arg, arg) + + +@_dask_cudf_performance_tracking +def _aggs_optimized(arg, supported: set): + """Check that aggregations in `arg` are a subset of `supported`""" + if isinstance(arg, (list, dict)): + if isinstance(arg, dict): + _global_set: set[str] = set() + for col in arg: + if isinstance(arg[col], list): + _global_set = _global_set.union(set(arg[col])) + elif isinstance(arg[col], dict): + _global_set = _global_set.union(set(arg[col].values())) + else: + _global_set.add(arg[col]) + else: + _global_set = set(arg) + + return bool(_global_set.issubset(supported)) + elif isinstance(arg, (str, type)): + return arg in supported + return False + def _get_spec_info(gb): if isinstance(gb.arg, (dict, list)): @@ -105,20 +357,14 @@ def shuffle_by_index(self): @classmethod def chunk(cls, df, *by, **kwargs): - from dask_cudf._legacy.groupby import _groupby_partition_agg - return _groupby_partition_agg(df, **kwargs) @classmethod def combine(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby import _tree_node_agg - return _tree_node_agg(_concat(inputs), **kwargs) @classmethod def aggregate(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby 
import _finalize_gb_agg - return _finalize_gb_agg(_concat(inputs), **kwargs) @property @@ -193,12 +439,6 @@ def _maybe_get_custom_expr( shuffle_method=None, **kwargs, ): - from dask_cudf._legacy.groupby import ( - OPTIMIZED_AGGS, - _aggs_optimized, - _redirect_aggs, - ) - if kwargs: # Unsupported key-word arguments return None diff --git a/python/dask_cudf/dask_cudf/_legacy/core.py b/python/dask_cudf/dask_cudf/_legacy/core.py deleted file mode 100644 index d6beb775a5e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/core.py +++ /dev/null @@ -1,711 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import math -import warnings - -import numpy as np -import pandas as pd -from tlz import partition_all - -from dask import dataframe as dd -from dask.base import normalize_token, tokenize -from dask.dataframe.core import ( - Scalar, - handle_out, - make_meta as dask_make_meta, - map_partitions, -) -from dask.dataframe.utils import raise_on_meta_error -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname - -import cudf -from cudf import _lib as libcudf -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._expr.accessors import ListMethods, StructMethods -from dask_cudf._legacy import sorting -from dask_cudf._legacy.sorting import ( - _deprecate_shuffle_kwarg, - _get_shuffle_method, -) - - -class _Frame(dd.core._Frame, OperatorMethodMixin): - """Superclass for DataFrame and Series - - Parameters - ---------- - dsk : dict - The dask graph to compute this DataFrame - name : str - The key prefix that specifies which keys in the dask comprise this - particular DataFrame / Series - meta : cudf.DataFrame, cudf.Series, or cudf.Index - An empty cudf object with names, dtypes, and indices matching the - expected output. - divisions : tuple of index values - Values along which we partition our blocks on the index - """ - - def _is_partition_type(self, meta): - return isinstance(meta, self._partition_type) - - def __repr__(self): - s = "" - return s % (type(self).__name__, len(self.dask), self.npartitions) - - -normalize_token.register(_Frame, lambda a: a._name) - - -class DataFrame(_Frame, dd.core.DataFrame): - """ - A distributed Dask DataFrame where the backing dataframe is a - :class:`cuDF DataFrame `. - - Typically you would not construct this object directly, but rather - use one of Dask-cuDF's IO routines. - - Most operations on :doc:`Dask DataFrames ` are - supported, with many of the same caveats. 
- - """ - - _partition_type = cudf.DataFrame - - @_dask_cudf_performance_tracking - def _assign_column(self, k, v): - def assigner(df, k, v): - out = df.copy() - out[k] = v - return out - - meta = assigner(self._meta, k, dask_make_meta(v)) - return self.map_partitions(assigner, k, v, meta=meta) - - @_dask_cudf_performance_tracking - def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): - import uuid - - if kwargs is None: - kwargs = {} - - if cache_key is None: - cache_key = uuid.uuid4() - - def do_apply_rows(df, func, incols, outcols, kwargs): - return df.apply_rows( - func, incols, outcols, kwargs, cache_key=cache_key - ) - - meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) - return self.map_partitions( - do_apply_rows, func, incols, outcols, kwargs, meta=meta - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def merge(self, other, shuffle_method=None, **kwargs): - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().merge( - other, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def join(self, other, shuffle_method=None, **kwargs): - # CuDF doesn't support "right" join yet - how = kwargs.pop("how", "left") - if how == "right": - return other.join(other=self, how="left", **kwargs) - - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().join( - other, - how=how, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def set_index( - self, - other, - sorted=False, - divisions=None, - shuffle_method=None, - **kwargs, - ): - pre_sorted = sorted - del sorted - - if divisions == "quantile": - warnings.warn( - "Using divisions='quantile' is now deprecated. 
" - "Please raise an issue on github if you believe " - "this feature is necessary.", - FutureWarning, - ) - - if ( - divisions == "quantile" - or isinstance(divisions, (cudf.DataFrame, cudf.Series)) - or ( - isinstance(other, str) - and cudf.api.types.is_string_dtype(self[other].dtype) - ) - ): - # Let upstream-dask handle "pre-sorted" case - if pre_sorted: - return dd.shuffle.set_sorted_index( - self, other, divisions=divisions, **kwargs - ) - - by = other - if not isinstance(other, list): - by = [by] - if len(by) > 1: - raise ValueError("Dask does not support MultiIndex (yet).") - if divisions == "quantile": - divisions = None - - # Use dask_cudf's sort_values - df = self.sort_values( - by, - max_branch=kwargs.get("max_branch", None), - divisions=divisions, - set_divisions=True, - ignore_index=True, - shuffle_method=shuffle_method, - ) - - # Ignore divisions if its a dataframe - if isinstance(divisions, cudf.DataFrame): - divisions = None - - # Set index and repartition - df2 = df.map_partitions( - sorting.set_index_post, - index_name=other, - drop=kwargs.get("drop", True), - column_dtype=df.columns.dtype, - ) - npartitions = kwargs.get("npartitions", self.npartitions) - partition_size = kwargs.get("partition_size", None) - if partition_size: - return df2.repartition(partition_size=partition_size) - if not divisions and df2.npartitions != npartitions: - return df2.repartition(npartitions=npartitions) - if divisions and df2.npartitions != len(divisions) - 1: - return df2.repartition(divisions=divisions) - return df2 - - return super().set_index( - other, - sorted=pre_sorted, - shuffle_method=_get_shuffle_method(shuffle_method), - divisions=divisions, - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def sort_values( - self, - by, - ignore_index=False, - max_branch=None, - divisions=None, - set_divisions=False, - ascending=True, - na_position="last", - sort_function=None, - sort_function_kwargs=None, - shuffle_method=None, - **kwargs, - ): - if kwargs: - raise ValueError( - f"Unsupported input arguments passed : {list(kwargs.keys())}" - ) - - df = sorting.sort_values( - self, - by, - max_branch=max_branch, - divisions=divisions, - set_divisions=set_divisions, - ignore_index=ignore_index, - ascending=ascending, - na_position=na_position, - shuffle_method=shuffle_method, - sort_function=sort_function, - sort_function_kwargs=sort_function_kwargs, - ) - - if ignore_index: - return df.reset_index(drop=True) - return df - - @_dask_cudf_performance_tracking - def to_parquet(self, path, *args, **kwargs): - """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" - from dask_cudf._legacy.io import to_parquet - - return to_parquet(self, path, *args, **kwargs) - - @_dask_cudf_performance_tracking - def to_orc(self, path, **kwargs): - """Calls dask_cudf._legacy.io.to_orc""" - from dask_cudf._legacy.io import to_orc - - return to_orc(self, path, **kwargs) - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - numeric_only=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, 
split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def shuffle(self, *args, shuffle_method=None, **kwargs): - """Wraps dask.dataframe DataFrame.shuffle method""" - return super().shuffle( - *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs - ) - - @_dask_cudf_performance_tracking - def groupby(self, by=None, **kwargs): - from .groupby import CudfDataFrameGroupBy - - return CudfDataFrameGroupBy(self, by=by, **kwargs) - - -@_dask_cudf_performance_tracking -def sum_of_squares(x): - x = x.astype("f8")._column - outcol = libcudf.reduce.reduce("sum_of_squares", x) - return cudf.Series._from_column(outcol) - - -@_dask_cudf_performance_tracking -def var_aggregate(x2, x, n, ddof): - try: - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - result = (x2 / n) - (x / n) ** 2 - if ddof != 0: - result = result * n / (n - ddof) - return result - except ZeroDivisionError: - return np.float64(np.nan) - - -@_dask_cudf_performance_tracking -def nlargest_agg(x, **kwargs): - return cudf.concat(x).nlargest(**kwargs) - - -@_dask_cudf_performance_tracking -def nsmallest_agg(x, **kwargs): - return cudf.concat(x).nsmallest(**kwargs) - - -class Series(_Frame, dd.core.Series): - _partition_type = cudf.Series - - @_dask_cudf_performance_tracking - def count(self, split_every=False): - return reduction( - [self], - chunk=M.count, - aggregate=np.sum, - split_every=split_every, - meta="i8", - ) - - @_dask_cudf_performance_tracking - def mean(self, split_every=False): - sum = self.sum(split_every=split_every) - n = self.count(split_every=split_every) - return sum / n - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var(axis=axis, skipna=skipna) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_dask_cudf_performance_tracking - def groupby(self, *args, **kwargs): - from .groupby import CudfSeriesGroupBy - - return CudfSeriesGroupBy(self, *args, **kwargs) - - @property # type: ignore - @_dask_cudf_performance_tracking - def list(self): - return ListMethods(self) - - @property # type: ignore - @_dask_cudf_performance_tracking - def struct(self): - return StructMethods(self) - - -class Index(Series, dd.core.Index): - _partition_type = cudf.Index # type: ignore - - -@_dask_cudf_performance_tracking -def _naive_var(ddf, meta, skipna, ddof, split_every, out): - num = ddf._get_numeric_data() - x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) - n = num.count(split_every=split_every) - name = ddf._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof - ) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _parallel_var(ddf, meta, skipna, split_every, out): - def _local_var(x, skipna): - if skipna: - n = x.count() - avg = x.mean(skipna=skipna) - else: - # Not 
skipping nulls, so might as well - # avoid the full `count` operation - n = len(x) - avg = x.sum(skipna=skipna) / n - m2 = ((x - avg) ** 2).sum(skipna=skipna) - return n, avg, m2 - - def _aggregate_var(parts): - n, avg, m2 = parts[0] - for i in range(1, len(parts)): - n_a, avg_a, m2_a = n, avg, m2 - n_b, avg_b, m2_b = parts[i] - n = n_a + n_b - avg = (n_a * avg_a + n_b * avg_b) / n - delta = avg_b - avg_a - m2 = m2_a + m2_b + delta**2 * n_a * n_b / n - return n, avg, m2 - - def _finalize_var(vals): - n, _, m2 = vals - return m2 / (n - 1) - - # Build graph - nparts = ddf.npartitions - if not split_every: - split_every = nparts - name = "var-" + tokenize(skipna, split_every, out) - local_name = "local-" + name - num = ddf._get_numeric_data() - dsk = { - (local_name, n, 0): (_local_var, (num._name, n), skipna) - for n in range(nparts) - } - - # Use reduction tree - widths = [nparts] - while nparts > 1: - nparts = math.ceil(nparts / split_every) - widths.append(nparts) - height = len(widths) - for depth in range(1, height): - for group in range(widths[depth]): - p_max = widths[depth - 1] - lstart = split_every * group - lstop = min(lstart + split_every, p_max) - node_list = [ - (local_name, p, depth - 1) for p in range(lstart, lstop) - ] - dsk[(local_name, group, depth)] = (_aggregate_var, node_list) - if height == 1: - group = depth = 0 - dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) - - graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) - result = dd.core.new_dd_object(graph, name, meta, (None, None)) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _extract_meta(x): - """ - Extract internal cache data (``_meta``) from dask_cudf objects - """ - if isinstance(x, (Scalar, _Frame)): - return x._meta - elif isinstance(x, list): - return [_extract_meta(_x) for _x in x] - elif isinstance(x, tuple): - return tuple(_extract_meta(_x) for _x in x) - elif isinstance(x, dict): - return {k: _extract_meta(v) for k, v in x.items()} - return x - - -@_dask_cudf_performance_tracking -def _emulate(func, *args, **kwargs): - """ - Apply a function using args / kwargs. If arguments contain dd.DataFrame / - dd.Series, using internal cache (``_meta``) for calculation - """ - with raise_on_meta_error(funcname(func)): - return func(*_extract_meta(args), **_extract_meta(kwargs)) - - -@_dask_cudf_performance_tracking -def align_partitions(args): - """Align partitions between dask_cudf objects. - - Note that if all divisions are unknown, but have equal npartitions, then - they will be passed through unchanged. - """ - dfs = [df for df in args if isinstance(df, _Frame)] - if not dfs: - return args - - divisions = dfs[0].divisions - if not all(df.divisions == divisions for df in dfs): - raise NotImplementedError("Aligning mismatched partitions") - return args - - -@_dask_cudf_performance_tracking -def reduction( - args, - chunk=None, - aggregate=None, - combine=None, - meta=None, - token=None, - chunk_kwargs=None, - aggregate_kwargs=None, - combine_kwargs=None, - split_every=None, - **kwargs, -): - """Generic tree reduction operation. - - Parameters - ---------- - args : - Positional arguments for the `chunk` function. All `dask.dataframe` - objects should be partitioned and indexed equivalently. 
- chunk : function [block-per-arg] -> block - Function to operate on each block of data - aggregate : function list-of-blocks -> block - Function to operate on the list of results of chunk - combine : function list-of-blocks -> block, optional - Function to operate on intermediate lists of results of chunk - in a tree-reduction. If not provided, defaults to aggregate. - $META - token : str, optional - The name to use for the output keys. - chunk_kwargs : dict, optional - Keywords for the chunk function only. - aggregate_kwargs : dict, optional - Keywords for the aggregate function only. - combine_kwargs : dict, optional - Keywords for the combine function only. - split_every : int, optional - Group partitions into groups of this size while performing a - tree-reduction. If set to False, no tree-reduction will be used, - and all intermediates will be concatenated and passed to ``aggregate``. - Default is 8. - kwargs : - All remaining keywords will be passed to ``chunk``, ``aggregate``, and - ``combine``. - """ - if chunk_kwargs is None: - chunk_kwargs = dict() - if aggregate_kwargs is None: - aggregate_kwargs = dict() - chunk_kwargs.update(kwargs) - aggregate_kwargs.update(kwargs) - - if combine is None: - if combine_kwargs: - raise ValueError("`combine_kwargs` provided with no `combine`") - combine = aggregate - combine_kwargs = aggregate_kwargs - else: - if combine_kwargs is None: - combine_kwargs = dict() - combine_kwargs.update(kwargs) - - if not isinstance(args, (tuple, list)): - args = [args] - - npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)} - if len(npartitions) > 1: - raise ValueError("All arguments must have same number of partitions") - npartitions = npartitions.pop() - - if split_every is None: - split_every = 8 - elif split_every is False: - split_every = npartitions - elif split_every < 2 or not isinstance(split_every, int): - raise ValueError("split_every must be an integer >= 2") - - token_key = tokenize( - token or (chunk, aggregate), - meta, - args, - chunk_kwargs, - aggregate_kwargs, - combine_kwargs, - split_every, - ) - - # Chunk - a = f"{token or funcname(chunk)}-chunk-{token_key}" - if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: - dsk = { - (a, 0, i): (chunk, key) - for i, key in enumerate(args[0].__dask_keys__()) - } - else: - dsk = { - (a, 0, i): ( - apply, - chunk, - [(x._name, i) if isinstance(x, _Frame) else x for x in args], - chunk_kwargs, - ) - for i in range(args[0].npartitions) - } - - # Combine - b = f"{token or funcname(combine)}-combine-{token_key}" - k = npartitions - depth = 0 - while k > split_every: - for part_i, inds in enumerate(partition_all(split_every, range(k))): - conc = (list, [(a, depth, i) for i in inds]) - dsk[(b, depth + 1, part_i)] = ( - (apply, combine, [conc], combine_kwargs) - if combine_kwargs - else (combine, conc) - ) - k = part_i + 1 - a = b - depth += 1 - - # Aggregate - b = f"{token or funcname(aggregate)}-agg-{token_key}" - conc = (list, [(a, depth, i) for i in range(k)]) - if aggregate_kwargs: - dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) - else: - dsk[(b, 0)] = (aggregate, conc) - - if meta is None: - meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) - meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) - meta = dask_make_meta(meta) - - graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) - return dd.core.new_dd_object(graph, b, meta, (None, None)) - - -for name in ( - "add", - "sub", - "mul", - "truediv", - "floordiv", - 
"mod", - "pow", - "radd", - "rsub", - "rmul", - "rtruediv", - "rfloordiv", - "rmod", - "rpow", -): - meth = getattr(cudf.DataFrame, name) - DataFrame._bind_operator_method(name, meth, original=cudf.Series) - - meth = getattr(cudf.Series, name) - Series._bind_operator_method(name, meth, original=cudf.Series) - -for name in ("lt", "gt", "le", "ge", "ne", "eq"): - meth = getattr(cudf.Series, name) - Series._bind_comparison_method(name, meth, original=cudf.Series) diff --git a/python/dask_cudf/dask_cudf/_legacy/groupby.py b/python/dask_cudf/dask_cudf/_legacy/groupby.py deleted file mode 100644 index 7e01e91476d..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/groupby.py +++ /dev/null @@ -1,909 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -from functools import wraps - -import numpy as np -import pandas as pd - -from dask.dataframe.core import ( - DataFrame as DaskDataFrame, - aca, - split_out_on_cols, -) -from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy -from dask.utils import funcname - -import cudf -from cudf.core.groupby.groupby import _deprecate_collect -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._legacy.sorting import _deprecate_shuffle_kwarg - -# aggregations that are dask-cudf optimized -OPTIMIZED_AGGS = ( - "count", - "mean", - "std", - "var", - "sum", - "min", - "max", - list, - "first", - "last", -) - - -def _check_groupby_optimized(func): - """ - Decorator for dask-cudf's groupby methods that returns the dask-cudf - optimized method if the groupby object is supported, otherwise - reverting to the upstream Dask method - """ - - @wraps(func) - def wrapper(*args, **kwargs): - gb = args[0] - if _groupby_optimized(gb): - return func(*args, **kwargs) - # note that we use upstream Dask's default kwargs for this call if - # none are specified; this shouldn't be an issue as those defaults are - # consistent with dask-cudf - return getattr(super(type(gb), gb), func.__name__)(*args[1:], **kwargs) - - return wrapper - - -class CudfDataFrameGroupBy(DataFrameGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - def __getitem__(self, key): - if isinstance(key, list): - g = CudfDataFrameGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - else: - g = CudfSeriesGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - - g._meta = g._meta[key] - return g - - @_dask_cudf_performance_tracking - def _make_groupby_method_aggs(self, agg_name): - """Create aggs dictionary for aggregation methods""" - - if isinstance(self.by, list): - return {c: agg_name for c in self.obj.columns if c not in self.by} - return {c: agg_name for c in self.obj.columns if c != self.by} - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("count"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("mean"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, 
split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("std"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("var"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("sum"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("min"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("max"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs(list), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("first"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("last"), - split_every, - split_out, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - if isinstance(self._meta.grouping.keys, cudf.MultiIndex): - keys = self._meta.grouping.keys.names - else: - keys = self._meta.grouping.keys.name - - return groupby_agg( - self.obj, - keys, - arg, - split_every=split_every, - split_out=split_out, - sep=self.sep, - sort=self.sort, - as_index=self.as_index, - shuffle_method=shuffle_method, - **self.dropna, - ) - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -class CudfSeriesGroupBy(SeriesGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "count"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "mean"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "std"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, 
split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "var"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "sum"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "min"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "max"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - {self._slice: list}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "first"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "last"}, - split_every, - split_out, - )[self._slice] - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if not isinstance(arg, dict): - arg = {self._slice: arg} - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - return _make_groupby_agg_call( - self, arg, split_every, split_out, shuffle_method - )[self._slice] - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -def _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token=None, - sort=None, - shuffle_method=None, -): - # Shuffle-based groupby aggregation - # NOTE: This function is the dask_cudf version of - # dask.dataframe.groupby._shuffle_aggregate - - # Step 1 - Chunkwise groupby operation - chunk_name = f"{token or funcname(chunk)}-chunk" - chunked = ddf.map_partitions( - chunk, - meta=chunk(ddf._meta, **chunk_kwargs), - token=chunk_name, - **chunk_kwargs, - ) - - # Step 2 - Perform global sort or shuffle - shuffle_npartitions = max( - chunked.npartitions // split_every, - split_out, - ) - if sort and split_out > 1: - # Sort-based code path - result = ( - chunked.repartition(npartitions=shuffle_npartitions) - .sort_values( - gb_cols, - ignore_index=True, - shuffle_method=shuffle_method, - ) - .map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - ) - else: - # Hash-based code path - result = chunked.shuffle( - gb_cols, - npartitions=shuffle_npartitions, - ignore_index=True, - shuffle_method=shuffle_method, - ).map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - - # Step 3 - Repartition and return - if split_out < result.npartitions: - return result.repartition(npartitions=split_out) - return result - - -@_dask_cudf_performance_tracking -def 
groupby_agg( - ddf, - gb_cols, - aggs_in, - split_every=None, - split_out=None, - dropna=True, - sep="___", - sort=False, - as_index=True, - shuffle_method=None, -): - """Optimized groupby aggregation for Dask-CuDF. - - Parameters - ---------- - ddf : DataFrame - DataFrame object to perform grouping on. - gb_cols : str or list[str] - Column names to group by. - aggs_in : str, list, or dict - Aggregations to perform. - split_every : int (optional) - How to group intermediate aggregates. - dropna : bool - Drop grouping key values corresponding to NA values. - as_index : bool - Currently ignored. - sort : bool - Sort the group keys, better performance is obtained when - not sorting. - shuffle_method : str (optional) - Control how shuffling of the DataFrame is performed. - sep : str - Internal usage. - - - Notes - ----- - This "optimized" approach is more performant than the algorithm in - implemented in :meth:`DataFrame.apply` because it allows the cuDF - backend to perform multiple aggregations at once. - - This aggregation algorithm only supports the following options - - * "list" - * "count" - * "first" - * "last" - * "max" - * "mean" - * "min" - * "std" - * "sum" - * "var" - - - See Also - -------- - DataFrame.groupby : generic groupby of a DataFrame - dask.dataframe.apply_concat_apply : for more description of the - split_every argument. - - """ - # Assert that aggregations are supported - aggs = _redirect_aggs(aggs_in) - if not _aggs_optimized(aggs, OPTIMIZED_AGGS): - raise ValueError( - f"Supported aggs include {OPTIMIZED_AGGS} for groupby_agg API. " - f"Aggregations must be specified with dict or list syntax." - ) - - # If split_every is False, we use an all-to-one reduction - if split_every is False: - split_every = max(ddf.npartitions, 2) - - # Deal with default split_out and split_every params - split_every = split_every or 8 - split_out = split_out or 1 - - # Standardize `gb_cols`, `columns`, and `aggs` - if isinstance(gb_cols, str): - gb_cols = [gb_cols] - columns = [c for c in ddf.columns if c not in gb_cols] - if not isinstance(aggs, dict): - aggs = {col: aggs for col in columns} - - # Assert if our output will have a MultiIndex; this will be the case if - # any value in the `aggs` dict is not a string (i.e. multiple/named - # aggregations per column) - str_cols_out = True - aggs_renames = {} - for col in aggs: - if isinstance(aggs[col], str) or callable(aggs[col]): - aggs[col] = [aggs[col]] - elif isinstance(aggs[col], dict): - str_cols_out = False - col_aggs = [] - for k, v in aggs[col].items(): - aggs_renames[col, v] = k - col_aggs.append(v) - aggs[col] = col_aggs - else: - str_cols_out = False - if col in gb_cols: - columns.append(col) - - # Construct meta - _aggs = aggs.copy() - if str_cols_out: - # Metadata should use `str` for dict values if that is - # what the user originally specified (column names will - # be str, rather than tuples). 
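To make the fused-aggregation idea concrete, a minimal sketch (the columns "k" and "x" are hypothetical, not taken from this module): passing a dict of aggregations to a single cuDF .agg call computes count and sum together in one pass per partition, so that "mean" can be finalized later as sum divided by count.

import cudf

# One partition's worth of data with a hypothetical key column "k".
part = cudf.DataFrame({"k": [0, 1, 0, 1], "x": [1.0, 2.0, 3.0, 4.0]})

# Partition-level step: count and sum computed together per group.
partial = part.groupby("k", as_index=False, sort=False).agg(
    {"x": ["count", "sum"]}
)
# Later tree nodes sum these partials; "mean" is finalized as sum / count.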
- for col in aggs: - _aggs[col] = _aggs[col][0] - _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs) - if aggs_renames: - col_array = [] - agg_array = [] - for col, agg in _meta.columns: - col_array.append(col) - agg_array.append(aggs_renames.get((col, agg), agg)) - _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - chunk = _groupby_partition_agg - chunk_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - combine = _tree_node_agg - combine_kwargs = { - "gb_cols": gb_cols, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - aggregate = _finalize_gb_agg - aggregate_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "final_columns": _meta.columns, - "as_index": as_index, - "dropna": dropna, - "sort": sort, - "sep": sep, - "str_cols_out": str_cols_out, - "aggs_renames": aggs_renames, - } - - # Use shuffle_method=True for split_out>1 - if sort and split_out > 1 and shuffle_method is None: - shuffle_method = "tasks" - - # Check if we are using the shuffle-based algorithm - if shuffle_method: - # Shuffle-based aggregation - return _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token="cudf-aggregate", - sort=sort, - shuffle_method=shuffle_method - if isinstance(shuffle_method, str) - else None, - ) - - # Deal with sort/shuffle defaults - if split_out > 1 and sort: - raise ValueError( - "dask-cudf's groupby algorithm does not yet support " - "`sort=True` when `split_out>1`, unless a shuffle-based " - "algorithm is used. Please use `split_out=1`, group " - "with `sort=False`, or set `shuffle_method=True`." - ) - - # Determine required columns to enable column projection - required_columns = list( - set(gb_cols).union(aggs.keys()).intersection(ddf.columns) - ) - - return aca( - [ddf[required_columns]], - chunk=chunk, - chunk_kwargs=chunk_kwargs, - combine=combine, - combine_kwargs=combine_kwargs, - aggregate=aggregate, - aggregate_kwargs=aggregate_kwargs, - token="cudf-aggregate", - split_every=split_every, - split_out=split_out, - split_out_setup=split_out_on_cols, - split_out_setup_kwargs={"cols": gb_cols}, - sort=sort, - ignore_index=True, - ) - - -@_dask_cudf_performance_tracking -def _make_groupby_agg_call( - gb, aggs, split_every, split_out, shuffle_method=None -): - """Helper method to consolidate the common `groupby_agg` call for all - aggregations in one place - """ - - return groupby_agg( - gb.obj, - gb.by, - aggs, - split_every=split_every, - split_out=split_out, - sep=gb.sep, - sort=gb.sort, - as_index=gb.as_index, - shuffle_method=shuffle_method, - **gb.dropna, - ) - - -@_dask_cudf_performance_tracking -def _redirect_aggs(arg): - """Redirect aggregations to their corresponding name in cuDF""" - redirects = { - sum: "sum", - max: "max", - min: "min", - "collect": list, - "list": list, - } - if isinstance(arg, dict): - new_arg = dict() - for col in arg: - if isinstance(arg[col], list): - new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] - elif isinstance(arg[col], dict): - new_arg[col] = { - k: redirects.get(v, v) for k, v in arg[col].items() - } - else: - new_arg[col] = redirects.get(arg[col], arg[col]) - return new_arg - if isinstance(arg, list): - return [redirects.get(agg, agg) for agg in arg] - return redirects.get(arg, arg) - - -@_dask_cudf_performance_tracking -def _aggs_optimized(arg, supported: set): - """Check that aggregations in `arg` are a subset of 
`supported`""" - if isinstance(arg, (list, dict)): - if isinstance(arg, dict): - _global_set: set[str] = set() - for col in arg: - if isinstance(arg[col], list): - _global_set = _global_set.union(set(arg[col])) - elif isinstance(arg[col], dict): - _global_set = _global_set.union(set(arg[col].values())) - else: - _global_set.add(arg[col]) - else: - _global_set = set(arg) - - return bool(_global_set.issubset(supported)) - elif isinstance(arg, (str, type)): - return arg in supported - return False - - -@_dask_cudf_performance_tracking -def _groupby_optimized(gb): - """Check that groupby input can use dask-cudf optimized codepath""" - return isinstance(gb.obj, DaskDataFrame) and ( - isinstance(gb.by, str) - or (isinstance(gb.by, list) and all(isinstance(x, str) for x in gb.by)) - ) - - -def _make_name(col_name, sep="_"): - """Combine elements of `col_name` into a single string, or no-op if - `col_name` is already a string - """ - if isinstance(col_name, str): - return col_name - return sep.join(name for name in col_name if name != "") - - -@_dask_cudf_performance_tracking -def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): - """Initial partition-level aggregation task. - - This is the first operation to be executed on each input - partition in `groupby_agg`. Depending on `aggs`, four possible - groupby aggregations ("count", "sum", "min", and "max") are - performed. The result is then partitioned (by hashing `gb_cols`) - into a number of distinct dictionary elements. The number of - elements in the output dictionary (`split_out`) corresponds to - the number of partitions in the final output of `groupby_agg`. - """ - - # Modify dict for initial (partition-wise) aggregations - _agg_dict = {} - for col, agg_list in aggs.items(): - _agg_dict[col] = set() - for agg in agg_list: - if agg in ("mean", "std", "var"): - _agg_dict[col].add("count") - _agg_dict[col].add("sum") - else: - _agg_dict[col].add(agg) - _agg_dict[col] = list(_agg_dict[col]) - if set(agg_list).intersection({"std", "var"}): - pow2_name = _make_name((col, "pow2"), sep=sep) - df[pow2_name] = df[col].astype("float64").pow(2) - _agg_dict[pow2_name] = ["sum"] - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - _agg_dict - ) - output_columns = [_make_name(name, sep=sep) for name in gb.columns] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _tree_node_agg(df, gb_cols, dropna, sort, sep): - """Node in groupby-aggregation reduction tree. - - The input DataFrame (`df`) corresponds to the - concatenated output of one or more `_groupby_partition_agg` - tasks. In this function, "sum", "min" and/or "max" groupby - aggregations will be used to combine the statistics for - duplicate keys. 
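A toy illustration of that combine step, assuming the module's "___"-separated column naming and made-up values:

import cudf

# Two partial results for the same group key, as emitted by the
# partition-level aggregation; summing them merges the duplicate key.
left = cudf.DataFrame({"k": [0], "x___sum": [6.0], "x___count": [3]})
right = cudf.DataFrame({"k": [0], "x___sum": [10.0], "x___count": [2]})
combined = cudf.concat([left, right]).groupby("k", as_index=False).sum()
# A later step can finalize the mean as x___sum / x___count (16.0 / 5).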
- """ - - agg_dict = {} - for col in df.columns: - if col in gb_cols: - continue - agg = col.split(sep)[-1] - if agg in ("count", "sum"): - agg_dict[col] = ["sum"] - elif agg == "list": - agg_dict[col] = [list] - elif agg in OPTIMIZED_AGGS: - agg_dict[col] = [agg] - else: - raise ValueError(f"Unexpected aggregation: {agg}") - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - agg_dict - ) - - # Don't include the last aggregation in the column names - output_columns = [ - _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) - for name in gb.columns - ] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): - """Calculate variance (given count, sum, and sum-squared columns).""" - - # Select count, sum, and sum-squared - n = df[count_name] - x = df[sum_name] - x2 = df[pow2_sum_name] - - # Use sum-squared approach to get variance - var = x2 - x**2 / n - div = n - ddof - div[div < 1] = 1 # Avoid division by 0 - var /= div - - # Set appropriate NaN elements - # (since we avoided 0-division) - var[(n - ddof) == 0] = np.nan - - return var - - -@_dask_cudf_performance_tracking -def _finalize_gb_agg( - gb_in, - gb_cols, - aggs, - columns, - final_columns, - as_index, - dropna, - sort, - sep, - str_cols_out, - aggs_renames, -): - """Final aggregation task. - - This is the final operation on each output partitions - of the `groupby_agg` algorithm. This function must - take care of higher-order aggregations, like "mean", - "std" and "var". We also need to deal with the column - index, the row index, and final sorting behavior. - """ - - gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) - - # Deal with higher-order aggregations - for col in columns: - agg_list = aggs.get(col, []) - agg_set = set(agg_list) - if agg_set.intersection({"mean", "std", "var"}): - count_name = _make_name((col, "count"), sep=sep) - sum_name = _make_name((col, "sum"), sep=sep) - if agg_set.intersection({"std", "var"}): - pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) - var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) - if "var" in agg_list: - name_var = _make_name((col, "var"), sep=sep) - gb[name_var] = var - if "std" in agg_list: - name_std = _make_name((col, "std"), sep=sep) - gb[name_std] = np.sqrt(var) - gb.drop(columns=[pow2_sum_name], inplace=True) - if "mean" in agg_list: - mean_name = _make_name((col, "mean"), sep=sep) - gb[mean_name] = gb[sum_name] / gb[count_name] - if "sum" not in agg_list: - gb.drop(columns=[sum_name], inplace=True) - if "count" not in agg_list: - gb.drop(columns=[count_name], inplace=True) - if list in agg_list: - collect_name = _make_name((col, "list"), sep=sep) - gb[collect_name] = gb[collect_name].list.concat() - - # Ensure sorted keys if `sort=True` - if sort: - gb = gb.sort_values(gb_cols) - - # Set index if necessary - if as_index: - gb.set_index(gb_cols, inplace=True) - - # Unflatten column names - col_array = [] - agg_array = [] - for col in gb.columns: - if col in gb_cols: - col_array.append(col) - agg_array.append("") - else: - name, agg = col.split(sep) - col_array.append(name) - agg_array.append(aggs_renames.get((name, agg), agg)) - if str_cols_out: - gb.columns = col_array - else: - gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - return gb[final_columns] diff --git a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py 
b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py index 0421bd755f4..c544c32523f 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py @@ -1,11 +1 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from .csv import read_csv # noqa: F401 -from .json import read_json # noqa: F401 -from .orc import read_orc, to_orc # noqa: F401 -from .text import read_text # noqa: F401 - -try: - from .parquet import read_parquet, to_parquet # noqa: F401 -except ImportError: - pass +# Copyright (c) 2018-2025, NVIDIA CORPORATION. diff --git a/python/dask_cudf/dask_cudf/_legacy/io/csv.py b/python/dask_cudf/dask_cudf/_legacy/io/csv.py deleted file mode 100644 index fa5400344f9..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/csv.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -import os -from glob import glob -from warnings import warn - -from fsspec.utils import infer_compression - -from dask import dataframe as dd -from dask.base import tokenize -from dask.dataframe.io.csv import make_reader -from dask.utils import apply, parse_bytes - -import cudf - - -def read_csv(path, blocksize="default", **kwargs): - """ - Read CSV files into a :class:`.DataFrame`. - - This API parallelizes the :func:`cudf:cudf.read_csv` function in - the following ways: - - It supports loading many files at once using globstrings: - - >>> import dask_cudf - >>> df = dask_cudf.read_csv("myfiles.*.csv") - - In some cases it can break up large files: - - >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - - It can read CSV files from external resources (e.g. S3, HTTP, FTP) - - >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") - >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - - Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and - supports many of the same keyword arguments with the same - performance guarantees. See the docstring for - :func:`cudf:cudf.read_csv` for more information on available - keyword arguments. - - Parameters - ---------- - path : str, path object, or file-like object - Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as - builtin :py:func:`open` file handler function or - :py:class:`~io.StringIO`). - blocksize : int or str, default "256 MiB" - The target task partition size. If ``None``, a single block - is used for each file. - **kwargs : dict - Passthrough key-word arguments that are sent to - :func:`cudf:cudf.read_csv`. - - Notes - ----- - If any of `skipfooter`/`skiprows`/`nrows` are passed, - `blocksize` will default to None. - - Examples - -------- - >>> import dask_cudf - >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) - >>> ddf.compute() - a b - 0 1 hi - 1 2 hello - 2 3 ai - - """ - - # Handle `chunksize` deprecation - if "chunksize" in kwargs: - chunksize = kwargs.pop("chunksize", "default") - warn( - "`chunksize` is deprecated and will be removed in the future. " - "Please use `blocksize` instead.", - FutureWarning, - ) - if blocksize == "default": - blocksize = chunksize - - # Set default `blocksize` - if blocksize == "default": - if ( - kwargs.get("skipfooter", 0) != 0 - or kwargs.get("skiprows", 0) != 0 - or kwargs.get("nrows", None) is not None - ): - # Cannot read in blocks if skipfooter, - # skiprows or nrows is passed. 
- blocksize = None - else: - blocksize = "256 MiB" - - if "://" in str(path): - func = make_reader(cudf.read_csv, "read_csv", "CSV") - return func(path, blocksize=blocksize, **kwargs) - else: - return _internal_read_csv(path=path, blocksize=blocksize, **kwargs) - - -def _internal_read_csv(path, blocksize="256 MiB", **kwargs): - if isinstance(blocksize, str): - blocksize = parse_bytes(blocksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - name = "read-csv-" + tokenize( - path, tokenize, **kwargs - ) # TODO: get last modified time - - compression = kwargs.get("compression", "infer") - - if compression == "infer": - # Infer compression from first path by default - compression = infer_compression(filenames[0]) - - if compression and blocksize: - # compressed CSVs reading must read the entire file - kwargs.pop("byte_range", None) - warn( - "Warning %s compression does not support breaking apart files\n" - "Please ensure that each individual file can fit in memory and\n" - "use the keyword ``blocksize=None to remove this message``\n" - "Setting ``blocksize=(size of file)``" % compression - ) - blocksize = None - - if blocksize is None: - return read_csv_without_blocksize(path, **kwargs) - - # Let dask.dataframe generate meta - dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") - kwargs1 = kwargs.copy() - usecols = kwargs1.pop("usecols", None) - dtype = kwargs1.pop("dtype", None) - meta = dask_reader(filenames[0], **kwargs1)._meta - names = meta.columns - if usecols or dtype: - # Regenerate meta with original kwargs if - # `usecols` or `dtype` was specified - meta = dask_reader(filenames[0], **kwargs)._meta - - dsk = {} - i = 0 - dtypes = meta.dtypes.values - - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, blocksize): - kwargs2 = kwargs.copy() - kwargs2["byte_range"] = ( - start, - blocksize, - ) # specify which chunk of the file we care about - if start != 0: - kwargs2["names"] = names # no header in the middle of the file - kwargs2["header"] = None - dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) - - i += 1 - - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) - - -def _read_csv(fn, dtypes=None, **kwargs): - return cudf.read_csv(fn, **kwargs) - - -def read_csv_without_blocksize(path, **kwargs): - """Read entire CSV with optional compression (gzip/zip) - - Parameters - ---------- - path : str - path to files (support for glob) - """ - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - name = "read-csv-" + tokenize(path, **kwargs) - - meta_kwargs = kwargs.copy() - if "skipfooter" in meta_kwargs: - meta_kwargs.pop("skipfooter") - if "nrows" in meta_kwargs: - meta_kwargs.pop("nrows") - # Read "head" of first file (first 5 rows). - # Convert to empty df for metadata. 
- meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0] - - graph = { - (name, i): (apply, cudf.read_csv, [fn], kwargs) - for i, fn in enumerate(filenames) - } - - divisions = [None] * (len(filenames) + 1) - - return dd.core.new_dd_object(graph, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/json.py b/python/dask_cudf/dask_cudf/_legacy/io/json.py deleted file mode 100644 index 98c5ceedb76..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/json.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from functools import partial - -import numpy as np -from fsspec.core import get_compression, get_fs_token_paths - -import dask -from dask.utils import parse_bytes - -import cudf -from cudf.core.column import as_column -from cudf.utils.ioutils import _is_local_filesystem - -from dask_cudf.backends import _default_backend - - -def _read_json_partition( - paths, - fs=None, - include_path_column=False, - path_converter=None, - **kwargs, -): - # Transfer all data up front for remote storage - sources = ( - paths - if fs is None - else fs.cat_ranges( - paths, - [0] * len(paths), - fs.sizes(paths), - ) - ) - - if include_path_column: - # Add "path" column. - # Must iterate over sources sequentially - if not isinstance(include_path_column, str): - include_path_column = "path" - converted_paths = ( - paths - if path_converter is None - else [path_converter(path) for path in paths] - ) - dfs = [] - for i, source in enumerate(sources): - df = cudf.read_json(source, **kwargs) - df[include_path_column] = as_column( - converted_paths[i], length=len(df) - ) - dfs.append(df) - return cudf.concat(dfs) - else: - # Pass sources directly to cudf - return cudf.read_json(sources, **kwargs) - - -def read_json( - url_path, - engine="auto", - blocksize=None, - orient="records", - lines=None, - compression="infer", - aggregate_files=True, - **kwargs, -): - """Read JSON data into a :class:`.DataFrame`. - - This function wraps :func:`dask.dataframe.read_json`, and passes - ``engine=partial(cudf.read_json, engine="auto")`` by default. - - Parameters - ---------- - url_path : str, list of str - Location to read from. If a string, can include a glob character to - find a set of file names. - Supports protocol specifications such as ``"s3://"``. - engine : str or Callable, default "auto" - - If str, this value will be used as the ``engine`` argument - when :func:`cudf.read_json` is used to create each partition. - If a :obj:`~collections.abc.Callable`, this value will be used as the - underlying function used to create each partition from JSON - data. The default value is "auto", so that - ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to :func:`dask.dataframe.read_json` by default. - aggregate_files : bool or int - Whether to map multiple files to each output partition. If True, - the `blocksize` argument will be used to determine the number of - files in each partition. If any one file is larger than `blocksize`, - the `aggregate_files` argument will be ignored. If an integer value - is specified, the `blocksize` argument will be ignored, and that - number of files will be mapped to each partition. Default is True. - **kwargs : - Key-word arguments to pass through to :func:`dask.dataframe.read_json`. 
- - Returns - ------- - :class:`.DataFrame` - - Examples - -------- - Load single file - - >>> from dask_cudf import read_json - >>> read_json('myfile.json') # doctest: +SKIP - - Load large line-delimited JSON files using partitions of approx - 256MB size - - >>> read_json('data/file*.csv', blocksize=2**28) # doctest: +SKIP - - Load nested JSON data - - >>> read_json('myfile.json') # doctest: +SKIP - - See Also - -------- - dask.dataframe.read_json - - """ - - if lines is None: - lines = orient == "records" - if orient != "records" and lines: - raise ValueError( - 'Line-delimited JSON is only available with orient="records".' - ) - if blocksize and (orient != "records" or not lines): - raise ValueError( - "JSON file chunking only allowed for JSON-lines" - "input (orient='records', lines=True)." - ) - - inputs = [] - if aggregate_files and blocksize or int(aggregate_files) > 1: - # Attempt custom read if we are mapping multiple files - # to each output partition. Otherwise, upstream logic - # is sufficient. - - storage_options = kwargs.get("storage_options", {}) - fs, _, paths = get_fs_token_paths( - url_path, mode="rb", storage_options=storage_options - ) - if isinstance(aggregate_files, int) and aggregate_files > 1: - # Map a static file count to each partition - inputs = [ - paths[offset : offset + aggregate_files] - for offset in range(0, len(paths), aggregate_files) - ] - elif aggregate_files is True and blocksize: - # Map files dynamically (using blocksize) - file_sizes = fs.sizes(paths) # NOTE: This can be slow - blocksize = parse_bytes(blocksize) - if all([file_size <= blocksize for file_size in file_sizes]): - counts = np.unique( - np.floor(np.cumsum(file_sizes) / blocksize), - return_counts=True, - )[1] - offsets = np.concatenate([[0], counts.cumsum()]) - inputs = [ - paths[offsets[i] : offsets[i + 1]] - for i in range(len(offsets) - 1) - ] - - if inputs: - # Inputs were successfully populated. - # Use custom _read_json_partition function - # to generate each partition. - - compression = get_compression( - url_path[0] if isinstance(url_path, list) else url_path, - compression, - ) - _kwargs = dict( - orient=orient, - lines=lines, - compression=compression, - include_path_column=kwargs.get("include_path_column", False), - path_converter=kwargs.get("path_converter"), - ) - if not _is_local_filesystem(fs): - _kwargs["fs"] = fs - # TODO: Generate meta more efficiently - meta = _read_json_partition(inputs[0][:1], **_kwargs) - return dask.dataframe.from_map( - _read_json_partition, - inputs, - meta=meta, - **_kwargs, - ) - - # Fall back to dask.dataframe.read_json - return _default_backend( - dask.dataframe.read_json, - url_path, - engine=( - partial(cudf.read_json, engine=engine) - if isinstance(engine, str) - else engine - ), - blocksize=blocksize, - orient=orient, - lines=lines, - compression=compression, - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/orc.py b/python/dask_cudf/dask_cudf/_legacy/io/orc.py deleted file mode 100644 index fcf684fd6c8..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/orc.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
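The blocksize-driven bucketing above maps whole files to partitions so that each partition stays near the requested size; a small sketch of that arithmetic with made-up file sizes:

import numpy as np

file_sizes = [100, 120, 90, 200, 50]  # hypothetical sizes, MiB
blocksize = 256                       # target partition size, MiB
counts = np.unique(
    np.floor(np.cumsum(file_sizes) / blocksize), return_counts=True
)[1]
offsets = np.concatenate([[0], counts.cumsum()])
# offsets == [0, 2, 4, 5]: partitions take files [0:2], [2:4], [4:5]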
- -from io import BufferedWriter, IOBase - -from fsspec.core import get_fs_token_paths -from fsspec.utils import stringify_path -from pyarrow import orc as orc - -from dask import dataframe as dd -from dask.dataframe.io.utils import _get_pyarrow_dtypes - -import cudf - - -def _read_orc_stripe(source, fs, columns=None, kwargs=None): - """Pull out specific columns from specific stripe""" - path, stripe = source - if kwargs is None: - kwargs = {} - with fs.open(path, "rb") as f: - df_stripe = cudf.read_orc( - f, stripes=[stripe], columns=columns, **kwargs - ) - return df_stripe - - -def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read ORC files into a :class:`.DataFrame`. - - Note that this function is mostly borrowed from upstream Dask. - - Parameters - ---------- - path : str or list[str] - Location of file(s), which can be a full URL with protocol specifier, - and may include glob character if a single string. - columns : None or list[str] - Columns to load. If None, loads all. - filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out - row groups using statistics stored for each row group as - Parquet metadata. Row groups that do not match the given - filter predicate are not read. The predicate is expressed in - `disjunctive normal form (DNF) - `__ - like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary - boolean logical combinations of single column predicates. The - innermost tuples each describe a single column predicate. The - list of inner predicates is interpreted as a conjunction - (AND), forming a more selective and multiple column predicate. - Finally, the outermost list combines these filters as a - disjunction (OR). Predicates may also be passed as a list of - tuples. This form is interpreted as a single conjunction. To - express OR in predicates, one must use the (preferred) - notation of list of lists of tuples. - storage_options : None or dict - Further parameters to pass to the bytes backend. 
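To make the DNF filter notation concrete, a hedged usage sketch (the path and column names are hypothetical):

    >>> from dask_cudf import read_orc                            # doctest: +SKIP
    >>> # (x == 0 AND y > 5) OR (x == 1): outer list = OR, inner lists = AND
    >>> filters = [[("x", "=", 0), ("y", ">", 5)], [("x", "=", 1)]]
    >>> df = read_orc("data/*.orc", filters=filters)              # doctest: +SKIP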
- - See Also - -------- - dask.dataframe.read_orc - - Returns - ------- - dask_cudf.DataFrame - - """ - - storage_options = storage_options or {} - fs, _, paths = get_fs_token_paths( - path, mode="rb", storage_options=storage_options - ) - schema = None - nstripes_per_file = [] - for path in paths: - with fs.open(path, "rb") as f: - o = orc.ORCFile(f) - if schema is None: - schema = o.schema - elif schema != o.schema: - raise ValueError( - "Incompatible schemas while parsing ORC files" - ) - nstripes_per_file.append(o.nstripes) - schema = _get_pyarrow_dtypes(schema, categories=None) - if columns is not None: - ex = set(columns) - set(schema) - if ex: - raise ValueError( - f"Requested columns ({ex}) not in schema ({set(schema)})" - ) - else: - columns = list(schema) - - with fs.open(paths[0], "rb") as f: - meta = cudf.read_orc( - f, - stripes=[0] if nstripes_per_file[0] else None, - columns=columns, - **kwargs, - ) - - sources = [] - for path, n in zip(paths, nstripes_per_file): - for stripe in ( - range(n) - if filters is None - else cudf.io.orc._filter_stripes(filters, path) - ): - sources.append((path, stripe)) - - return dd.from_map( - _read_orc_stripe, - sources, - args=[fs], - columns=columns, - kwargs=kwargs, - meta=meta, - ) - - -def write_orc_partition(df, path, fs, filename, compression="snappy"): - full_path = fs.sep.join([path, filename]) - with fs.open(full_path, mode="wb") as out_file: - if not isinstance(out_file, IOBase): - out_file = BufferedWriter(out_file) - cudf.io.to_orc(df, out_file, compression=compression) - return full_path - - -def to_orc( - df, - path, - write_index=True, - storage_options=None, - compression="snappy", - compute=True, - **kwargs, -): - """ - Write a :class:`.DataFrame` to ORC file(s) (one file per partition). - - Parameters - ---------- - df : DataFrame - path : str or pathlib.Path - Destination directory for data. Prepend with protocol like ``s3://`` - or ``hdfs://`` for remote data. - write_index : boolean, optional - Whether or not to write the index. Defaults to True. - storage_options : None or dict - Further parameters to pass to the bytes backend. - compression : string or dict, optional - compute : bool, optional - If True (default) then the result is computed immediately. If - False then a :class:`~dask.delayed.Delayed` object is returned - for future computation. 
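A brief usage sketch for this write path (paths are illustrative; with ``compute=False`` a delayed object is returned for later execution):

    >>> import dask_cudf                                          # doctest: +SKIP
    >>> ddf = dask_cudf.read_orc("data/*.orc")                    # doctest: +SKIP
    >>> writes = to_orc(ddf, "out/", compression="snappy",
    ...                 compute=False)                            # doctest: +SKIP
    >>> writes.compute()                                          # doctest: +SKIP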
- - """ - - from dask import compute as dask_compute, delayed - - # TODO: Use upstream dask implementation once available - # (see: Dask Issue#5596) - - if hasattr(path, "name"): - path = stringify_path(path) - fs, _, _ = get_fs_token_paths( - path, mode="wb", storage_options=storage_options - ) - # Trim any protocol information from the path before forwarding - path = fs._strip_protocol(path) - - if write_index: - df = df.reset_index() - else: - # Not writing index - might as well drop it - df = df.reset_index(drop=True) - - fs.mkdirs(path, exist_ok=True) - - # Use i_offset and df.npartitions to define file-name list - filenames = ["part.%i.orc" % i for i in range(df.npartitions)] - - # write parts - dwrite = delayed(write_orc_partition) - parts = [ - dwrite(d, path, fs, filename, compression=compression) - for d, filename in zip(df.to_delayed(), filenames) - ] - - if compute: - return dask_compute(*parts) - - return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0638e4a1c3..c0792663c7e 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import itertools import warnings from functools import partial @@ -8,7 +8,7 @@ import pandas as pd from pyarrow import dataset as pa_ds, parquet as pq -from dask import dataframe as dd +import dask.dataframe as dd from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine try: @@ -448,65 +448,7 @@ def set_object_dtypes_from_pa_schema(df, schema): df._data[col_name] = col.astype(typ) -def read_parquet(path, columns=None, **kwargs): - """ - Read parquet files into a :class:`.DataFrame`. - - Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` - to coordinate the execution of :func:`cudf.read_parquet`, and to - ultimately create a :class:`.DataFrame` collection. - - See the :func:`dask.dataframe.read_parquet` documentation for - all available options. - - Examples - -------- - >>> from dask_cudf import read_parquet - >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP - - When dealing with one or more large parquet files having an - in-memory footprint >15% device memory, the ``split_row_groups`` - argument should be used to map Parquet **row-groups** to DataFrame - partitions (instead of **files** to partitions). For example, the - following code will map each row-group to a distinct partition: - - >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP - - To map **multiple** row-groups to each partition, an integer can be - passed to ``split_row_groups`` to specify the **maximum** number of - row-groups allowed in each output partition: - - >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP - - See Also - -------- - cudf.read_parquet - dask.dataframe.read_parquet - """ - if isinstance(columns, str): - columns = [columns] - - # Set "check_file_size" option to determine whether we - # should check the parquet-file size. This check is meant - # to "protect" users from `split_row_groups` default changes - check_file_size = kwargs.pop("check_file_size", 500_000_000) - if ( - check_file_size - and ("split_row_groups" not in kwargs) - and ("chunksize" not in kwargs) - ): - # User is not specifying `split_row_groups` or `chunksize`, - # so we should warn them if/when a file is ~>0.5GB on disk. 
- # They can set `split_row_groups` explicitly to silence/skip - # this check - if "read" not in kwargs: - kwargs["read"] = {} - kwargs["read"]["check_file_size"] = check_file_size - - return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) - - -to_parquet = partial(dd.to_parquet, engine=CudfEngine) +to_parquet = dd.to_parquet if create_metadata_file_dd is None: create_metadata_file = create_metadata_file_dd diff --git a/python/dask_cudf/dask_cudf/_legacy/io/text.py b/python/dask_cudf/dask_cudf/_legacy/io/text.py deleted file mode 100644 index 3757c85c80c..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/text.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -import os -from glob import glob - -import dask.dataframe as dd -from dask.utils import parse_bytes - -import cudf - - -def _read_text(source, **kwargs): - # Wrapper for cudf.read_text operation - fn, byte_range = source - return cudf.read_text(fn, byte_range=byte_range, **kwargs) - - -def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs): - if isinstance(chunksize, str): - chunksize = parse_bytes(chunksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - if chunksize and byte_range: - raise ValueError("Cannot specify both chunksize and byte_range.") - - if chunksize: - sources = [] - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, chunksize): - byte_range = ( - start, - chunksize, - ) # specify which chunk of the file we care about - sources.append((fn, byte_range)) - else: - sources = [(fn, byte_range) for fn in filenames] - - return dd.from_map( - _read_text, - sources, - meta=cudf.Series([], dtype="O"), - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/sorting.py b/python/dask_cudf/dask_cudf/_legacy/sorting.py deleted file mode 100644 index a2ba4d1878e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/sorting.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import warnings -from collections.abc import Iterator -from functools import wraps - -import cupy -import numpy as np -import tlz as toolz - -from dask import config -from dask.base import tokenize -from dask.dataframe import methods -from dask.dataframe.core import DataFrame, Index, Series -from dask.dataframe.shuffle import rearrange_by_column -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M - -import cudf -from cudf.api.types import _is_categorical_dtype -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -_SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported - - -def _deprecate_shuffle_kwarg(func): - @wraps(func) - def wrapper(*args, **kwargs): - old_arg_value = kwargs.pop("shuffle", None) - - if old_arg_value is not None: - new_arg_value = old_arg_value - msg = ( - "the 'shuffle' keyword is deprecated, " - "use 'shuffle_method' instead." - ) - - warnings.warn(msg, FutureWarning) - if kwargs.get("shuffle_method") is not None: - msg = ( - "Can only specify 'shuffle' " - "or 'shuffle_method', not both." 
- ) - raise TypeError(msg) - kwargs["shuffle_method"] = new_arg_value - return func(*args, **kwargs) - - return wrapper - - -@_dask_cudf_performance_tracking -def set_index_post(df, index_name, drop, column_dtype): - df2 = df.set_index(index_name, drop=drop) - df2.columns = df2.columns.astype(column_dtype) - return df2 - - -@_dask_cudf_performance_tracking -def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): - if ascending: - partitions = divisions.searchsorted(s, side="right") - 1 - else: - partitions = ( - len(divisions) - divisions.searchsorted(s, side="right") - 1 - ) - partitions[(partitions < 0) | (partitions >= len(divisions) - 1)] = ( - 0 if ascending else (len(divisions) - 2) - ) - partitions[s._columns[0].isnull().values] = ( - len(divisions) - 2 if na_position == "last" else 0 - ) - return partitions - - -@_dask_cudf_performance_tracking -def _quantile(a, q): - n = len(a) - if not len(a): - return None, n - return ( - a.quantile(q=q.tolist(), interpolation="nearest", method="table"), - n, - ) - - -@_dask_cudf_performance_tracking -def merge_quantiles(finalq, qs, vals): - """Combine several quantile calculations of different data. - [NOTE: Same logic as dask.array merge_percentiles] - """ - if isinstance(finalq, Iterator): - finalq = list(finalq) - finalq = np.array(finalq) - qs = list(map(list, qs)) - vals = list(vals) - vals, Ns = zip(*vals) - Ns = list(Ns) - - L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N])) - if not L: - raise ValueError("No non-trivial arrays found") - qs, vals, Ns = L - - if len(vals) != len(qs) or len(Ns) != len(qs): - raise ValueError("qs, vals, and Ns parameters must be the same length") - - # transform qs and Ns into number of observations between quantiles - counts = [] - for q, N in zip(qs, Ns): - count = np.empty(len(q)) - count[1:] = np.diff(q) - count[0] = q[0] - count *= N - counts.append(count) - - def _append_counts(val, count): - val["_counts"] = count - return val - - # Sort by calculated quantile values, then number of observations. - combined_vals_counts = cudf.core.reshape._merge_sorted( - [*map(_append_counts, vals, counts)] - ) - combined_counts = cupy.asnumpy(combined_vals_counts["_counts"].values) - combined_vals = combined_vals_counts.drop(columns=["_counts"]) - - # quantile-like, but scaled by total number of observations - combined_q = np.cumsum(combined_counts) - - # rescale finalq quantiles to match combined_q - desired_q = finalq * sum(Ns) - - # TODO: Support other interpolation methods - # For now - Always use "nearest" for interpolation - left = np.searchsorted(combined_q, desired_q, side="left") - right = np.searchsorted(combined_q, desired_q, side="right") - 1 - np.minimum(left, len(combined_vals) - 1, left) # don't exceed max index - lower = np.minimum(left, right) - upper = np.maximum(left, right) - lower_residual = np.abs(combined_q[lower] - desired_q) - upper_residual = np.abs(combined_q[upper] - desired_q) - mask = lower_residual > upper_residual - index = lower # alias; we no longer need lower - index[mask] = upper[mask] - rv = combined_vals.iloc[index] - return rv.reset_index(drop=True) - - -@_dask_cudf_performance_tracking -def _approximate_quantile(df, q): - """Approximate quantiles of DataFrame or Series. 
- [NOTE: Same logic as dask.dataframe Series quantile] - """ - # current implementation needs q to be sorted so - # sort if array-like, otherwise leave it alone - q_ndarray = np.array(q) - if q_ndarray.ndim > 0: - q_ndarray.sort(kind="mergesort") - q = q_ndarray - - # Lets assume we are dealing with a DataFrame throughout - if isinstance(df, (Series, Index)): - df = df.to_frame() - assert isinstance(df, DataFrame) - final_type = df._meta._constructor - - # Create metadata - meta = df._meta_nonempty.quantile(q=q, method="table") - - # Define final action (create df with quantiles as index) - def finalize_tsk(tsk): - return (final_type, tsk) - - return_type = df.__class__ - - # pandas/cudf uses quantile in [0, 1] - # numpy / cupy uses [0, 100] - qs = np.asarray(q) - token = tokenize(df, qs) - - if len(qs) == 0: - name = "quantiles-" + token - empty_index = cudf.Index([], dtype=float) - return Series( - { - (name, 0): final_type( - {col: [] for col in df.columns}, - name=df.name, - index=empty_index, - ) - }, - name, - df._meta, - [None, None], - ) - else: - new_divisions = [np.min(q), np.max(q)] - - name = "quantiles-1-" + token - val_dsk = { - (name, i): (_quantile, key, qs) - for i, key in enumerate(df.__dask_keys__()) - } - - name2 = "quantiles-2-" + token - merge_dsk = { - (name2, 0): finalize_tsk( - (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk)) - ) - } - dsk = toolz.merge(val_dsk, merge_dsk) - graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df]) - df = return_type(graph, name2, meta, new_divisions) - - def set_quantile_index(df): - df.index = q - return df - - df = df.map_partitions(set_quantile_index, meta=meta) - return df - - -@_dask_cudf_performance_tracking -def quantile_divisions(df, by, npartitions): - qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() - divisions = _approximate_quantile(df[by], qn).compute() - columns = divisions.columns - - # TODO: Make sure divisions are correct for all dtypes.. 
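For intuition, the quantile points computed above look like this for a made-up partition count:

import numpy as np

npartitions = 4
qn = np.linspace(0.0, 1.0, npartitions + 1).tolist()
# qn == [0.0, 0.25, 0.5, 0.75, 1.0]; the sort-key values found at these
# quantiles become the division boundaries, one bucket per output partition.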
- if ( - len(columns) == 1 - and df[columns[0]].dtype != "object" - and not _is_categorical_dtype(df[columns[0]].dtype) - ): - dtype = df[columns[0]].dtype - divisions = divisions[columns[0]].astype("int64") - divisions.iloc[-1] += 1 - divisions = sorted( - divisions.drop_duplicates().astype(dtype).to_arrow().tolist(), - key=lambda x: (x is None, x), - ) - else: - for col in columns: - dtype = df[col].dtype - if dtype != "object": - divisions[col] = divisions[col].astype("int64") - divisions[col].iloc[-1] += 1 - divisions[col] = divisions[col].astype(dtype) - else: - if last := divisions[col].iloc[-1]: - val = chr(ord(last[0]) + 1) - else: - val = "this string intentionally left empty" # any but "" - divisions[col].iloc[-1] = val - divisions = divisions.drop_duplicates().sort_index() - return divisions - - -@_deprecate_shuffle_kwarg -@_dask_cudf_performance_tracking -def sort_values( - df, - by, - max_branch=None, - divisions=None, - set_divisions=False, - ignore_index=False, - ascending=True, - na_position="last", - shuffle_method=None, - sort_function=None, - sort_function_kwargs=None, -): - """Sort by the given list/tuple of column names.""" - - if not isinstance(ascending, bool): - raise ValueError("ascending must be either True or False") - if na_position not in ("first", "last"): - raise ValueError("na_position must be either 'first' or 'last'") - - npartitions = df.npartitions - if isinstance(by, tuple): - by = list(by) - elif not isinstance(by, list): - by = [by] - - # parse custom sort function / kwargs if provided - sort_kwargs = { - "by": by, - "ascending": ascending, - "na_position": na_position, - } - if sort_function is None: - sort_function = M.sort_values - if sort_function_kwargs is not None: - sort_kwargs.update(sort_function_kwargs) - - # handle single partition case - if npartitions == 1: - return df.map_partitions(sort_function, **sort_kwargs) - - # Step 1 - Calculate new divisions (if necessary) - if divisions is None: - divisions = quantile_divisions(df, by, npartitions) - - # Step 2 - Perform repartitioning shuffle - meta = df._meta._constructor_sliced([0]) - if not isinstance(divisions, (cudf.Series, cudf.DataFrame)): - dtype = df[by[0]].dtype - divisions = df._meta._constructor_sliced(divisions, dtype=dtype) - - partitions = df[by].map_partitions( - _set_partitions_pre, - divisions=divisions, - ascending=ascending, - na_position=na_position, - meta=meta, - ) - - df2 = df.assign(_partitions=partitions) - df3 = rearrange_by_column( - df2, - "_partitions", - max_branch=max_branch, - npartitions=len(divisions) - 1, - shuffle_method=_get_shuffle_method(shuffle_method), - ignore_index=ignore_index, - ).drop(columns=["_partitions"]) - df3.divisions = (None,) * (df3.npartitions + 1) - - # Step 3 - Return final sorted df - df4 = df3.map_partitions(sort_function, **sort_kwargs) - if not isinstance(divisions, cudf.DataFrame) and set_divisions: - # Can't have multi-column divisions elsewhere in dask (yet) - df4.divisions = tuple(methods.tolist(divisions)) - - return df4 - - -def get_default_shuffle_method(): - # Note that `dask.utils.get_default_shuffle_method` - # will return "p2p" by default when a distributed - # client is present. 
Dask-cudf supports "p2p", but - # will not use it by default (yet) - default = config.get("dataframe.shuffle.method", "tasks") - if default not in _SHUFFLE_SUPPORT: - default = "tasks" - return default - - -def _get_shuffle_method(shuffle_method): - # Utility to set the shuffle_method-kwarg default - # and to validate user-specified options - shuffle_method = shuffle_method or get_default_shuffle_method() - if shuffle_method not in _SHUFFLE_SUPPORT: - raise ValueError( - "Dask-cudf only supports the following shuffle " - f"methods: {_SHUFFLE_SUPPORT}. Got shuffle_method={shuffle_method}" - ) - - return shuffle_method diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index fceaaf185e8..f33733d9583 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import warnings from collections.abc import Iterator @@ -11,14 +11,12 @@ from packaging.version import Version from pandas.api.types import is_scalar -import dask.dataframe as dd from dask import config from dask.array.dispatch import percentile_lookup from dask.dataframe.backends import ( DataFrameBackendEntrypoint, PandasBackendEntrypoint, ) -from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( categorical_dtype_dispatch, concat_dispatch, @@ -28,6 +26,8 @@ hash_object_dispatch, is_categorical_dtype_dispatch, make_meta_dispatch, + meta_nonempty, + partd_encode_dispatch, pyarrow_schema_dispatch, to_pyarrow_table_dispatch, tolist_dispatch, @@ -46,13 +46,6 @@ from cudf.api.types import is_string_dtype from cudf.utils.performance_tracking import _dask_cudf_performance_tracking -from ._legacy.core import DataFrame, Index, Series - -get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) -get_parallel_type.register(cudf.Series, lambda _: Series) -get_parallel_type.register(cudf.BaseIndex, lambda _: Index) - - # Required for Arrow filesystem support in read_parquet PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0") @@ -318,7 +311,7 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( - (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) + (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype) # , Series) ) @_dask_cudf_performance_tracking def is_categorical_dtype_cudf(obj): @@ -464,28 +457,21 @@ def sizeof_cudf_series_index(obj): return obj.memory_usage() -# TODO: Remove try/except when cudf is pinned to dask>=2023.10.0 -try: - from dask.dataframe.dispatch import partd_encode_dispatch - - @partd_encode_dispatch.register(cudf.DataFrame) - def _simple_cudf_encode(_): - # Basic pickle-based encoding for a partd k-v store - import pickle +@partd_encode_dispatch.register(cudf.DataFrame) +def _simple_cudf_encode(_): + # Basic pickle-based encoding for a partd k-v store + import pickle - import partd + import partd - def join(dfs): - if not dfs: - return cudf.DataFrame() - else: - return cudf.concat(dfs) - - dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) - return partial(partd.Encode, dumps, pickle.loads, join) + def join(dfs): + if not dfs: + return cudf.DataFrame() + else: + return cudf.concat(dfs) -except ImportError: - pass + dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) + return partial(partd.Encode, dumps, pickle.loads, join) def _default_backend(func, *args, **kwargs): @@ -557,105 +543,22 @@ def to_cudf_dispatch_from_cudf(data, 
**kwargs): return data -# Define "cudf" backend engine to be registered with Dask -class CudfBackendEntrypoint(DataFrameBackendEntrypoint): - """Backend-entrypoint class for Dask-DataFrame +# Define the "cudf" backend for "legacy" Dask DataFrame +class LegacyCudfBackendEntrypoint(DataFrameBackendEntrypoint): + """Backend-entrypoint class for legacy Dask-DataFrame This class is registered under the name "cudf" for the - ``dask.dataframe.backends`` entrypoint in ``setup.cfg``. - Dask-DataFrame will use the methods defined in this class - in place of ``dask.dataframe.`` when the - "dataframe.backend" configuration is set to "cudf": - - Examples - -------- - >>> import dask - >>> import dask.dataframe as dd - >>> with dask.config.set({"dataframe.backend": "cudf"}): - ... ddf = dd.from_dict({"a": range(10)}) - >>> type(ddf) - + ``dask.dataframe.backends`` entrypoint in ``pyproject.toml``. + This "legacy" backend is only used for CSV support. """ - @classmethod - def to_backend_dispatch(cls): - return to_cudf_dispatch - - @classmethod - def to_backend(cls, data: dd.core._Frame, **kwargs): - if isinstance(data._meta, (cudf.DataFrame, cudf.Series, cudf.Index)): - # Already a cudf-backed collection - _unsupported_kwargs("cudf", "cudf", kwargs) - return data - return data.map_partitions(cls.to_backend_dispatch(), **kwargs) - - @staticmethod - def from_dict( - data, - npartitions, - orient="columns", - dtype=None, - columns=None, - constructor=cudf.DataFrame, - ): - return _default_backend( - dd.from_dict, - data, - npartitions=npartitions, - orient=orient, - dtype=dtype, - columns=columns, - constructor=constructor, - ) - - @staticmethod - def read_parquet(*args, engine=None, **kwargs): - from dask_cudf._legacy.io.parquet import CudfEngine - - _raise_unsupported_parquet_kwargs(**kwargs) - return _default_backend( - dd.read_parquet, - *args, - engine=CudfEngine, - **kwargs, - ) - - @staticmethod - def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json - - return read_json(*args, **kwargs) - @staticmethod - def read_orc(*args, **kwargs): - from dask_cudf._legacy.io import read_orc - - return read_orc(*args, **kwargs) - - @staticmethod - def read_csv(*args, **kwargs): - from dask_cudf._legacy.io import read_csv - - return read_csv(*args, **kwargs) - - @staticmethod - def read_hdf(*args, **kwargs): - # HDF5 reader not yet implemented in cudf - warnings.warn( - "read_hdf is not yet implemented in cudf/dask_cudf. " - "Moving to cudf from pandas. Expect poor performance!" - ) - return _default_backend(dd.read_hdf, *args, **kwargs).to_backend( - "cudf" - ) - - -# Define "cudf" backend entrypoint for dask-expr -class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint): +# Define the "cudf" backend for expr-based Dask DataFrame +class CudfBackendEntrypoint(DataFrameBackendEntrypoint): """Backend-entrypoint class for Dask-Expressions This class is registered under the name "cudf" for the - ``dask-expr.dataframe.backends`` entrypoint in ``setup.cfg``. + ``dask_expr.dataframe.backends`` entrypoint in ``pyproject.toml``. 
Dask-DataFrame will use the methods defined in this class in place of ``dask_expr.`` when the "dataframe.backend" configuration is set to "cudf": @@ -746,12 +649,12 @@ def read_csv( @staticmethod def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json as read_json_impl + from dask_cudf.io.json import read_json as read_json_impl return read_json_impl(*args, **kwargs) @staticmethod def read_orc(*args, **kwargs): - from dask_cudf._legacy.io.orc import read_orc as legacy_read_orc + from dask_cudf.io.orc import read_orc as legacy_read_orc return legacy_read_orc(*args, **kwargs) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 5fd217209ec..32461104ef9 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,56 +1,41 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import textwrap +import warnings +from importlib import import_module import dask.dataframe as dd -from dask.tokenize import tokenize import cudf from cudf.utils.performance_tracking import _dask_cudf_performance_tracking # This module provides backward compatibility for legacy import patterns. -if dd.DASK_EXPR_ENABLED: - from dask_cudf._expr.collection import ( - DataFrame, - Index, - Series, - ) -else: - from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 - +from dask_cudf._expr.collection import ( + DataFrame, # noqa: F401 + Index, # noqa: F401 + Series, # noqa: F401 +) concat = dd.concat @_dask_cudf_performance_tracking def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): - from dask_cudf import QUERY_PLANNING_ON - if isinstance(getattr(data, "index", None), cudf.MultiIndex): raise NotImplementedError( "dask_cudf does not support MultiIndex Dataframes." ) - # Dask-expr doesn't support the `name` argument - name = {} - if not QUERY_PLANNING_ON: - name = { - "name": name - or ("from_cudf-" + tokenize(data, npartitions or chunksize)) - } - return dd.from_pandas( data, npartitions=npartitions, chunksize=chunksize, sort=sort, - **name, ) -from_cudf.__doc__ = ( - textwrap.dedent( - """ +from_cudf.__doc__ = textwrap.dedent( + """ Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`. This function is a thin wrapper around @@ -58,9 +43,23 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): arguments (described below) excepting that it operates on cuDF rather than pandas objects.\n """ - ) - # TODO: `dd.from_pandas.__doc__` is empty when - # `DASK_DATAFRAME__QUERY_PLANNING=True` - # since dask-expr does not provide a docstring for from_pandas. - + textwrap.dedent(dd.from_pandas.__doc__ or "") -) +) + textwrap.dedent(dd.from_pandas.__doc__) + + +def _deprecated_api(old_api, new_api=None, rec=None): + def inner_func(*args, **kwargs): + if new_api: + # Use alternative + msg = f"{old_api} is now deprecated. " + msg += rec or f"Please use {new_api} instead." + warnings.warn(msg, FutureWarning) + new_attr = new_api.split(".") + module = import_module(".".join(new_attr[:-1])) + return getattr(module, new_attr[-1])(*args, **kwargs) + + # No alternative - raise an error + raise NotImplementedError( + f"{old_api} is no longer supported. 
" + (rec or "") + ) + + return inner_func diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 9bca33e414a..a5175c9bbe7 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api +from dask_cudf.core import _deprecated_api from . import csv, json, orc, parquet, text # noqa: F401 @@ -15,20 +15,13 @@ ) to_orc = _deprecated_api( "dask_cudf.io.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.orc.to_orc", rec="Please use the DataFrame.to_orc method instead.", ) read_text = _deprecated_api( "dask_cudf.io.read_text", new_api="dask_cudf.read_text" ) -if QUERY_PLANNING_ON: - read_parquet = parquet.read_parquet -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = parquet.read_parquet to_parquet = _deprecated_api( "dask_cudf.io.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 29f98b14511..e36ee04d827 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import os from glob import glob @@ -25,11 +25,11 @@ def read_csv(path, blocksize="default", **kwargs): >>> import dask_cudf >>> df = dask_cudf.read_csv("myfiles.*.csv") - In some cases it can break up large files: + It can break up large files if blocksize is specified: >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - It can read CSV files from external resources (e.g. S3, HTTP, FTP) + It can read CSV files from external resources (e.g. S3, HTTP, FTP): >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") @@ -44,15 +44,15 @@ def read_csv(path, blocksize="default", **kwargs): ---------- path : str, path object, or file-like object Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as + ``py._path.local.LocalPath``), URL (including HTTP, FTP, and S3 + locations), or any object with a ``read()`` method (such as builtin :py:func:`open` file handler function or :py:class:`~io.StringIO`). blocksize : int or str, default "256 MiB" The target task partition size. If ``None``, a single block is used for each file. **kwargs : dict - Passthrough key-word arguments that are sent to + Passthrough keyword arguments that are sent to :func:`cudf:cudf.read_csv`. Notes diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 8f85ea54c0a..3022ebb2a5b 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -1,8 +1,209 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
-from dask_cudf import _deprecated_api +from functools import partial -read_json = _deprecated_api( - "dask_cudf.io.json.read_json", - new_api="dask_cudf.read_json", -) +import numpy as np +from fsspec.core import get_compression, get_fs_token_paths + +import dask +from dask.utils import parse_bytes + +import cudf +from cudf.core.column import as_column +from cudf.utils.ioutils import _is_local_filesystem + +from dask_cudf.backends import _default_backend + + +def _read_json_partition( + paths, + fs=None, + include_path_column=False, + path_converter=None, + **kwargs, +): + # Transfer all data up front for remote storage + sources = ( + paths + if fs is None + else fs.cat_ranges( + paths, + [0] * len(paths), + fs.sizes(paths), + ) + ) + + if include_path_column: + # Add "path" column. + # Must iterate over sources sequentially + if not isinstance(include_path_column, str): + include_path_column = "path" + converted_paths = ( + paths + if path_converter is None + else [path_converter(path) for path in paths] + ) + dfs = [] + for i, source in enumerate(sources): + df = cudf.read_json(source, **kwargs) + df[include_path_column] = as_column( + converted_paths[i], length=len(df) + ) + dfs.append(df) + return cudf.concat(dfs) + else: + # Pass sources directly to cudf + return cudf.read_json(sources, **kwargs) + + +def read_json( + url_path, + engine="auto", + blocksize=None, + orient="records", + lines=None, + compression="infer", + aggregate_files=True, + **kwargs, +): + """Read JSON data into a :class:`.DataFrame`. + + This function wraps :func:`dask.dataframe.read_json`, and passes + ``engine=partial(cudf.read_json, engine="auto")`` by default. + + Parameters + ---------- + url_path : str, list of str + Location to read from. If a string, can include a glob character to + find a set of file names. + Supports protocol specifications such as ``"s3://"``. + engine : str or Callable, default "auto" + + If str, this value will be used as the ``engine`` argument + when :func:`cudf.read_json` is used to create each partition. + If a :obj:`~collections.abc.Callable`, this value will be used as the + underlying function used to create each partition from JSON + data. The default value is "auto", so that + ``engine=partial(cudf.read_json, engine="auto")`` will be + passed to :func:`dask.dataframe.read_json` by default. + aggregate_files : bool or int + Whether to map multiple files to each output partition. If True, + the `blocksize` argument will be used to determine the number of + files in each partition. If any one file is larger than `blocksize`, + the `aggregate_files` argument will be ignored. If an integer value + is specified, the `blocksize` argument will be ignored, and that + number of files will be mapped to each partition. Default is True. + **kwargs : + Key-word arguments to pass through to :func:`dask.dataframe.read_json`. + + Returns + ------- + :class:`.DataFrame` + + Examples + -------- + Load single file + + >>> from dask_cudf import read_json + >>> read_json('myfile.json') # doctest: +SKIP + + Load large line-delimited JSON files using partitions of approx + 256MB size + + >>> read_json('data/file*.csv', blocksize=2**28) # doctest: +SKIP + + Load nested JSON data + + >>> read_json('myfile.json') # doctest: +SKIP + + See Also + -------- + dask.dataframe.read_json + + """ + + if lines is None: + lines = orient == "records" + if orient != "records" and lines: + raise ValueError( + 'Line-delimited JSON is only available with orient="records".' 
+ ) + if blocksize and (orient != "records" or not lines): + raise ValueError( + "JSON file chunking only allowed for JSON-lines" + "input (orient='records', lines=True)." + ) + + inputs = [] + if aggregate_files and blocksize or int(aggregate_files) > 1: + # Attempt custom read if we are mapping multiple files + # to each output partition. Otherwise, upstream logic + # is sufficient. + + storage_options = kwargs.get("storage_options", {}) + fs, _, paths = get_fs_token_paths( + url_path, mode="rb", storage_options=storage_options + ) + if isinstance(aggregate_files, int) and aggregate_files > 1: + # Map a static file count to each partition + inputs = [ + paths[offset : offset + aggregate_files] + for offset in range(0, len(paths), aggregate_files) + ] + elif aggregate_files is True and blocksize: + # Map files dynamically (using blocksize) + file_sizes = fs.sizes(paths) # NOTE: This can be slow + blocksize = parse_bytes(blocksize) + if all([file_size <= blocksize for file_size in file_sizes]): + counts = np.unique( + np.floor(np.cumsum(file_sizes) / blocksize), + return_counts=True, + )[1] + offsets = np.concatenate([[0], counts.cumsum()]) + inputs = [ + paths[offsets[i] : offsets[i + 1]] + for i in range(len(offsets) - 1) + ] + + if inputs: + # Inputs were successfully populated. + # Use custom _read_json_partition function + # to generate each partition. + + compression = get_compression( + url_path[0] if isinstance(url_path, list) else url_path, + compression, + ) + _kwargs = dict( + orient=orient, + lines=lines, + compression=compression, + include_path_column=kwargs.get("include_path_column", False), + path_converter=kwargs.get("path_converter"), + ) + if not _is_local_filesystem(fs): + _kwargs["fs"] = fs + # TODO: Generate meta more efficiently + meta = _read_json_partition(inputs[0][:1], **_kwargs) + return dask.dataframe.from_map( + _read_json_partition, + inputs, + meta=meta, + **_kwargs, + ) + + # Fall back to dask.dataframe.read_json + return _default_backend( + dask.dataframe.read_json, + url_path, + engine=( + partial(cudf.read_json, engine=engine) + if isinstance(engine, str) + else engine + ), + blocksize=blocksize, + orient=orient, + lines=lines, + compression=compression, + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index 5219cdacc31..5de28751912 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,13 +1,195 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from dask_cudf import _deprecated_api - -read_orc = _deprecated_api( - "dask_cudf.io.orc.read_orc", - new_api="dask_cudf.read_orc", -) -to_orc = _deprecated_api( - "dask_cudf.io.orc.to_orc", - new_api="dask_cudf._legacy.io.orc.to_orc", - rec="Please use the DataFrame.to_orc method instead.", -) +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
+ +from io import BufferedWriter, IOBase + +from fsspec.core import get_fs_token_paths +from fsspec.utils import stringify_path +from pyarrow import orc as orc + +from dask import dataframe as dd +from dask.dataframe.io.utils import _get_pyarrow_dtypes + +import cudf + + +def _read_orc_stripe(source, fs, columns=None, kwargs=None): + """Pull out specific columns from specific stripe""" + path, stripe = source + if kwargs is None: + kwargs = {} + with fs.open(path, "rb") as f: + df_stripe = cudf.read_orc( + f, stripes=[stripe], columns=columns, **kwargs + ) + return df_stripe + + +def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): + """Read ORC files into a :class:`.DataFrame`. + + Note that this function is mostly borrowed from upstream Dask. + + Parameters + ---------- + path : str or list[str] + Location of file(s), which can be a full URL with protocol specifier, + and may include glob character if a single string. + columns : None or list[str] + Columns to load. If None, loads all. + filters : None or list of tuple or list of lists of tuples + If not None, specifies a filter predicate used to filter out + row groups using statistics stored for each row group as + Parquet metadata. Row groups that do not match the given + filter predicate are not read. The predicate is expressed in + `disjunctive normal form (DNF) + `__ + like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary + boolean logical combinations of single column predicates. The + innermost tuples each describe a single column predicate. The + list of inner predicates is interpreted as a conjunction + (AND), forming a more selective and multiple column predicate. + Finally, the outermost list combines these filters as a + disjunction (OR). Predicates may also be passed as a list of + tuples. This form is interpreted as a single conjunction. To + express OR in predicates, one must use the (preferred) + notation of list of lists of tuples. + storage_options : None or dict + Further parameters to pass to the bytes backend. 
+ + See Also + -------- + dask.dataframe.read_orc + + Returns + ------- + dask_cudf.DataFrame + + """ + + storage_options = storage_options or {} + fs, _, paths = get_fs_token_paths( + path, mode="rb", storage_options=storage_options + ) + schema = None + nstripes_per_file = [] + for path in paths: + with fs.open(path, "rb") as f: + o = orc.ORCFile(f) + if schema is None: + schema = o.schema + elif schema != o.schema: + raise ValueError( + "Incompatible schemas while parsing ORC files" + ) + nstripes_per_file.append(o.nstripes) + schema = _get_pyarrow_dtypes(schema, categories=None) + if columns is not None: + ex = set(columns) - set(schema) + if ex: + raise ValueError( + f"Requested columns ({ex}) not in schema ({set(schema)})" + ) + else: + columns = list(schema) + + with fs.open(paths[0], "rb") as f: + meta = cudf.read_orc( + f, + stripes=[0] if nstripes_per_file[0] else None, + columns=columns, + **kwargs, + ) + + sources = [] + for path, n in zip(paths, nstripes_per_file): + for stripe in ( + range(n) + if filters is None + else cudf.io.orc._filter_stripes(filters, path) + ): + sources.append((path, stripe)) + + return dd.from_map( + _read_orc_stripe, + sources, + args=[fs], + columns=columns, + kwargs=kwargs, + meta=meta, + ) + + +def write_orc_partition(df, path, fs, filename, compression="snappy"): + full_path = fs.sep.join([path, filename]) + with fs.open(full_path, mode="wb") as out_file: + if not isinstance(out_file, IOBase): + out_file = BufferedWriter(out_file) + cudf.io.to_orc(df, out_file, compression=compression) + return full_path + + +def to_orc( + df, + path, + write_index=True, + storage_options=None, + compression="snappy", + compute=True, + **kwargs, +): + """ + Write a :class:`.DataFrame` to ORC file(s) (one file per partition). + + Parameters + ---------- + df : DataFrame + path : str or pathlib.Path + Destination directory for data. Prepend with protocol like ``s3://`` + or ``hdfs://`` for remote data. + write_index : boolean, optional + Whether or not to write the index. Defaults to True. + storage_options : None or dict + Further parameters to pass to the bytes backend. + compression : string or dict, optional + compute : bool, optional + If True (default) then the result is computed immediately. If + False then a :class:`~dask.delayed.Delayed` object is returned + for future computation. 
+ + """ + + from dask import compute as dask_compute, delayed + + # TODO: Use upstream dask implementation once available + # (see: Dask Issue#5596) + + if hasattr(path, "name"): + path = stringify_path(path) + fs, _, _ = get_fs_token_paths( + path, mode="wb", storage_options=storage_options + ) + # Trim any protocol information from the path before forwarding + path = fs._strip_protocol(path) + + if write_index: + df = df.reset_index() + else: + # Not writing index - might as well drop it + df = df.reset_index(drop=True) + + fs.mkdirs(path, exist_ok=True) + + # Use i_offset and df.npartitions to define file-name list + filenames = ["part.%i.orc" % i for i in range(df.npartitions)] + + # write parts + dwrite = delayed(write_orc_partition) + parts = [ + dwrite(d, path, fs, filename, compression=compression) + for d, filename in zip(df.to_delayed(), filenames) + ] + + if compute: + return dask_compute(*parts) + + return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index ba6209c4820..a953dce787d 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -37,10 +37,9 @@ def TaskList(*x): import cudf -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api - # Dask-expr imports CudfEngine from this module from dask_cudf._legacy.io.parquet import CudfEngine +from dask_cudf.core import _deprecated_api if TYPE_CHECKING: from collections.abc import MutableMapping @@ -832,15 +831,8 @@ def read_parquet_expr( ) -if QUERY_PLANNING_ON: - read_parquet = read_parquet_expr - read_parquet.__doc__ = read_parquet_expr.__doc__ -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.parquet.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = read_parquet_expr +read_parquet.__doc__ = read_parquet_expr.__doc__ to_parquet = _deprecated_api( "dask_cudf.io.parquet.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index f5509cf91c3..48eca13e16f 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import math import os @@ -11,10 +11,6 @@ from dask.utils import tmpfile import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") def test_read_json_backend_dispatch(tmp_path): @@ -137,7 +133,3 @@ def test_deprecated_api_paths(tmp_path): with pytest.warns(match="dask_cudf.io.read_json is now deprecated"): df2 = dask_cudf.io.read_json(path) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.json.read_json is now deprecated"): - df2 = dask_cudf.io.json.read_json(path) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index b6064d851ca..4aac463420b 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
import glob import os @@ -12,10 +12,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) sample_orc = os.path.join(cur_dir, "data/orc/sample.orc") @@ -159,7 +155,3 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.read_orc is now deprecated"): df2 = dask_cudf.io.read_orc(paths) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.orc.read_orc is now deprecated"): - df2 = dask_cudf.io.orc.read_orc(paths) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 6efe6c4f388..9f7031f4d2a 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import glob import math @@ -16,11 +16,6 @@ import dask_cudf from dask_cudf._legacy.io.parquet import create_metadata_file -from dask_cudf.tests.utils import ( - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) # Check if create_metadata_file is supported by # the current dask.dataframe version @@ -450,7 +445,6 @@ def test_create_metadata_file(tmpdir, partition_on): dd.assert_eq(ddf1, ddf2) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @need_create_meta def test_create_metadata_file_inconsistent_schema(tmpdir): # NOTE: This test demonstrates that the CudfEngine @@ -531,19 +525,6 @@ def test_cudf_list_struct_write(tmpdir): dd.assert_eq(df, new_ddf) -@skip_dask_expr("Not necessary in dask-expr") -def test_check_file_size(tmpdir): - # Test simple file-size check to help warn users - # of upstream change to `split_row_groups` default - fn = str(tmpdir.join("test.parquet")) - cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) - with pytest.warns(match="large parquet file"): - # Need to use `dask_cudf._legacy.io` path - # TODO: Remove outdated `check_file_size` functionality - dask_cudf._legacy.io.read_parquet(fn, check_file_size=1).compute() - - -@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0") def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning @@ -626,7 +607,6 @@ def test_timezone_column(tmpdir): dd.assert_eq(got, expect) -@require_dask_expr() @pytest.mark.skipif( not dask_cudf.backends.PYARROW_GE_15, reason="Requires pyarrow 15", @@ -677,17 +657,8 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.to_parquet is now deprecated"): dask_cudf.io.to_parquet(df, tmpdir) - if dask_cudf.QUERY_PLANNING_ON: - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - else: - with pytest.warns(match="legacy dask_cudf.io.read_parquet"): - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) - with pytest.warns(match="legacy dask_cudf.io.parquet.read_parquet"): - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.parquet.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) diff --git 
a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index 90907f6fb99..7c53b89a883 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import os import socket @@ -14,7 +14,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import QUERY_PLANNING_ON moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") @@ -136,7 +135,7 @@ def test_read_parquet_open_file_options_raises(): pytest.param( "arrow", marks=pytest.mark.skipif( - not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15, + not dask_cudf.backends.PYARROW_GE_15, reason="Not supported", ), ), diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index e35b6411a9d..f4d59334e03 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. import os @@ -9,10 +9,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) text_file = os.path.join(cur_dir, "data/text/sample.pgn") @@ -42,7 +38,3 @@ def test_deprecated_api_paths(): with pytest.warns(match="dask_cudf.io.read_text is now deprecated"): df2 = dask_cudf.io.read_text(text_file, delimiter=".") dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.text.read_text is now deprecated"): - df2 = dask_cudf.io.text.read_text(text_file, delimiter=".") - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py index 1caf4e81d8e..eb1d007cc16 100644 --- a/python/dask_cudf/dask_cudf/io/text.py +++ b/python/dask_cudf/dask_cudf/io/text.py @@ -1,8 +1,56 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. -from dask_cudf import _deprecated_api +import os +from glob import glob -read_text = _deprecated_api( - "dask_cudf.io.text.read_text", - new_api="dask_cudf.read_text", -) +import dask.dataframe as dd +from dask.utils import parse_bytes + +import cudf + + +def _read_text(source, **kwargs): + # Wrapper for cudf.read_text operation + fn, byte_range = source + return cudf.read_text(fn, byte_range=byte_range, **kwargs) + + +def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs): + if isinstance(chunksize, str): + chunksize = parse_bytes(chunksize) + + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood:{type(path)}") + + if not filenames: + msg = f"A file in: {filenames} does not exist." 
+ raise FileNotFoundError(msg) + + if chunksize and byte_range: + raise ValueError("Cannot specify both chunksize and byte_range.") + + if chunksize: + sources = [] + for fn in filenames: + size = os.path.getsize(fn) + for start in range(0, size, chunksize): + byte_range = ( + start, + chunksize, + ) # specify which chunk of the file we care about + sources.append((fn, byte_range)) + else: + sources = [(fn, byte_range) for fn in filenames] + + return dd.from_map( + _read_text, + sources, + meta=cudf.Series([], dtype="O"), + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 3fbb2aacd2c..c6b01a648eb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -13,7 +13,6 @@ from cudf.testing._utils import does_not_raise import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr ############################################################################# # Datetime Accessor # @@ -112,7 +111,6 @@ def test_categorical_accessor_initialization2(data): dsr.cat -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_basic(data): cat = data.copy() diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7101fb7e00a..31957a106ff 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
import random @@ -9,18 +9,12 @@ import dask from dask import dataframe as dd -from dask.dataframe.core import make_meta as dask_make_meta, meta_nonempty +from dask.dataframe.dispatch import make_meta as dask_make_meta, meta_nonempty from dask.utils import M import cudf import dask_cudf -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) rng = np.random.default_rng(seed=0) @@ -299,37 +293,6 @@ def test_set_index_sorted(): gddf1.set_index("val", sorted=True) -@pytest.mark.parametrize("nelem", [10, 200, 1333]) -@pytest.mark.parametrize("index", [None, "myindex"]) -def test_rearrange_by_divisions(nelem, index): - with dask.config.set(scheduler="single-threaded"): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "x": rng.integers(0, 20, size=nelem), - "y": rng.normal(size=nelem), - "z": rng.choice(["dog", "cat", "bird"], nelem), - } - ) - df["z"] = df["z"].astype("category") - - ddf1 = dd.from_pandas(df, npartitions=4) - gdf1 = dask_cudf.from_cudf( - cudf.DataFrame.from_pandas(df), npartitions=4 - ) - ddf1.index.name = index - gdf1.index.name = index - divisions = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20) - - expect = dd.shuffle.rearrange_by_divisions( - ddf1, "x", divisions=divisions, shuffle_method="tasks" - ) - result = dd.shuffle.rearrange_by_divisions( - gdf1, "x", divisions=divisions, shuffle_method="tasks" - ) - dd.assert_eq(expect, result) - - def test_assign(): rng = np.random.default_rng(seed=0) df = pd.DataFrame( @@ -393,44 +356,6 @@ def test_setitem_scalar_datetime(): np.testing.assert_array_equal(got["z"], df["z"]) -@skip_dask_expr("Not relevant for dask-expr") -@pytest.mark.parametrize( - "func", - [ - lambda: pd.DataFrame( - {"A": rng.random(10), "B": rng.random(10)}, - index=list("abcdefghij"), - ), - lambda: pd.DataFrame( - { - "A": rng.random(10), - "B": list("a" * 10), - "C": pd.Series( - [str(20090101 + i) for i in range(10)], - dtype="datetime64[ns]", - ), - }, - index=list("abcdefghij"), - ), - lambda: pd.Series(list("abcdefghijklmnop")), - lambda: pd.Series( - rng.random(10), - index=pd.Index( - [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]" - ), - ), - ], -) -def test_repr(func): - pdf = func() - gdf = cudf.from_pandas(pdf) - gddf = dd.from_pandas(gdf, npartitions=3, sort=False) - - assert repr(gddf) - if hasattr(pdf, "_repr_html_"): - assert gddf._repr_html_() - - @pytest.mark.skip(reason="datetime indexes not fully supported in cudf") @pytest.mark.parametrize("start", ["1d", "5d", "1w", "12h"]) @pytest.mark.parametrize("stop", ["1d", "3d", "8h"]) @@ -657,20 +582,20 @@ def test_hash_object_dispatch(index): ) # DataFrame - result = dd.core.hash_object_dispatch(obj, index=index) + result = dd.dispatch.hash_object_dispatch(obj, index=index) expected = dask_cudf.backends.hash_object_cudf(obj, index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # Series - result = dd.core.hash_object_dispatch(obj["x"], index=index) + result = dd.dispatch.hash_object_dispatch(obj["x"], index=index) expected = dask_cudf.backends.hash_object_cudf(obj["x"], index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # DataFrame with MultiIndex obj_multi = obj.set_index(["x", "z"], drop=True) - result = dd.core.hash_object_dispatch(obj_multi, index=index) + result = dd.dispatch.hash_object_dispatch(obj_multi, index=index) expected = dask_cudf.backends.hash_object_cudf(obj_multi, index=index) assert isinstance(result, cudf.Series) 
dd.assert_eq(result, expected) @@ -784,7 +709,6 @@ def test_dataframe_set_index(): assert_eq(ddf.compute(), pddf.compute()) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_series_describe(): random.seed(0) sr = cudf.datasets.randomdata(20)["x"] @@ -800,7 +724,6 @@ def test_series_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_dataframe_describe(): random.seed(0) df = cudf.datasets.randomdata(20) @@ -814,7 +737,6 @@ def test_dataframe_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_zero_std_describe(): num = 84886781 df = cudf.DataFrame( @@ -864,7 +786,7 @@ def test_merging_categorical_columns(): ddf_1 = dask_cudf.from_cudf(df_1, npartitions=2) - ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"]) + ddf_1 = ddf_1.categorize(columns=["cat_col"]) df_2 = cudf.DataFrame( {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]} @@ -872,7 +794,7 @@ def test_merging_categorical_columns(): ddf_2 = dask_cudf.from_cudf(df_2, npartitions=2) - ddf_2 = dd.categorical.categorize(ddf_2, columns=["cat_col"]) + ddf_2 = ddf_2.categorize(columns=["cat_col"]) expected = cudf.DataFrame( { @@ -932,14 +854,9 @@ def func(x): result = ds.map_partitions(func, meta=s.values) - if QUERY_PLANNING_ON: - # Check Array and round-tripped DataFrame - dask.array.assert_eq(result, func(s)) - dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) - else: - # Legacy version still carries numpy metadata - # See: https://github.com/dask/dask/issues/11017 - dask.array.assert_eq(result.compute(), func(s)) + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) def test_implicit_array_conversion_cupy_sparse(): @@ -981,7 +898,6 @@ def test_series_isin_error(): ddf.isin([1, 5, "a"]).compute() -@require_dask_expr() def test_to_backend_simplify(): # Check that column projection is not blocked by to_backend with dask.config.set({"dataframe.backend": "pandas"}): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 9bd3b506db0..11ca0c6a783 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
import numpy as np import pandas as pd @@ -13,12 +13,7 @@ from cudf.testing._utils import expect_warning_if import dask_cudf -from dask_cudf._legacy.groupby import OPTIMIZED_AGGS, _aggs_optimized -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - xfail_dask_expr, -) +from dask_cudf._expr.groupby import OPTIMIZED_AGGS, _aggs_optimized def assert_cudf_groupby_layers(ddf): @@ -78,18 +73,12 @@ def test_groupby_basic(series, aggregation, pdf): expect = getattr(gdf_grouped, aggregation)() actual = getattr(ddf_grouped, aggregation)() - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) if not series: expect = gdf_grouped.agg({"x": aggregation}) actual = ddf_grouped.agg({"x": aggregation}) - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) @@ -134,13 +123,6 @@ def test_groupby_agg(func, aggregation, pdf): check_dtype = aggregation != "count" - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - - # groupby.agg should add an explicit getitem layer - # to improve/enable column projection - assert hlg_layer(actual.dask, "getitem") - dd.assert_eq(expect, actual, check_names=False, check_dtype=check_dtype) @@ -556,20 +538,13 @@ def test_groupby_categorical_key(): True, pytest.param( False, - marks=xfail_dask_expr("as_index not supported in dask-expr"), - ), - ], -) -@pytest.mark.parametrize( - "fused", - [ - True, - pytest.param( - False, - marks=require_dask_expr("Not supported by legacy API"), + marks=pytest.mark.xfail( + reason="as_index not supported in dask-expr" + ), ), ], ) +@pytest.mark.parametrize("fused", [True, False]) @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) @@ -590,19 +565,16 @@ def test_groupby_agg_params( "c": ["mean", "std", "var"], } - fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {} + fused_kwarg = {"fused": fused} split_kwargs = {"split_every": split_every, "split_out": split_out} if split_out == "use_dask_default": split_kwargs.pop("split_out") # Avoid using as_index when query-planning is enabled - if QUERY_PLANNING_ON: - with pytest.warns(FutureWarning, match="argument is now deprecated"): - # Should warn when `as_index` is used - ddf.groupby(["name", "a"], sort=False, as_index=as_index) - maybe_as_index = {"as_index": as_index} if as_index is False else {} - else: - maybe_as_index = {"as_index": as_index} + with pytest.warns(FutureWarning, match="argument is now deprecated"): + # Should warn when `as_index` is used + ddf.groupby(["name", "a"], sort=False, as_index=as_index) + maybe_as_index = {"as_index": as_index} if as_index is False else {} # Check `sort=True` behavior if split_out == 1: @@ -671,7 +643,6 @@ def test_groupby_agg_params( dd.assert_eq(gf, pf) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] ) @@ -711,7 +682,6 @@ def test_is_supported(arg, supported): assert _aggs_optimized(arg, OPTIMIZED_AGGS) is supported -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_groupby_unique_lists(): df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]}) gdf = cudf.from_pandas(df) @@ -758,7 +728,7 @@ def test_groupby_first_last(data, agg): ) -@xfail_dask_expr("Co-alignment check fails in dask-expr") 
+@pytest.mark.xfail(reason="Co-alignment check fails in dask-expr") def test_groupby_with_list_of_series(): df = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}) gdf = dask_cudf.from_cudf(df, npartitions=2) @@ -773,7 +743,6 @@ def test_groupby_with_list_of_series(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "func", [ @@ -833,7 +802,7 @@ def test_groupby_all_columns(func): expect = func(ddf) actual = func(gddf) - dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON) + dd.assert_eq(expect, actual, check_names=False) def test_groupby_shuffle(): @@ -870,15 +839,3 @@ def test_groupby_shuffle(): # NOTE: `shuffle_method=True` should be default got = gddf.groupby("a", sort=False).agg(spec, split_out=2) dd.assert_eq(expect, got.compute().sort_index()) - - if not QUERY_PLANNING_ON: - # Sorted aggregation fails with split_out>1 when shuffle is False - # (sort=True, split_out=2, shuffle_method=False) - with pytest.raises(ValueError): - gddf.groupby("a", sort=True).agg( - spec, shuffle_method=False, split_out=2 - ) - - # Check shuffle kwarg deprecation - with pytest.warns(match="'shuffle' keyword is deprecated"): - gddf.groupby("a", sort=True).agg(spec, shuffle=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py index 0b7c7855e07..2d05345bc4a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_onehot.py +++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import pandas as pd import pytest @@ -8,12 +8,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr - -# No dask-expr support -pytestmark = xfail_dask_expr( - "Newer dask version needed", lt_version="2024.5.0" -) def test_get_dummies_cat(): diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 02c815427f3..68d6e72660e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -10,7 +10,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr @pytest.mark.parametrize("ascending", [True, False]) @@ -67,7 +66,6 @@ def test_sort_repartition(): dd.assert_eq(len(new_ddf), len(ddf)) -@xfail_dask_expr("missing null support", lt_version="2024.5.1") @pytest.mark.parametrize("na_position", ["first", "last"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index b44b3f939e7..ef6765f39d1 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -1,22 +1,12 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
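+# Copyright (c) 2022-2025, NVIDIA CORPORATION.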
import numpy as np import pandas as pd -import pytest -from packaging.version import Version -import dask import dask.dataframe as dd import cudf -from dask_cudf import QUERY_PLANNING_ON - -if QUERY_PLANNING_ON: - DASK_VERSION = Version(dask.__version__) -else: - DASK_VERSION = None - def _make_random_frame(nelem, npartitions=2, include_na=False): rng = np.random.default_rng(seed=0) @@ -30,26 +20,3 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): gdf = cudf.DataFrame.from_pandas(df) dgf = dd.from_pandas(gdf, npartitions=npartitions) return df, dgf - - -_default_reason = "Not compatible with dask-expr" - - -def skip_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - skip = QUERY_PLANNING_ON - return pytest.mark.skipif(skip, reason=reason) - - -def xfail_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - xfail = QUERY_PLANNING_ON - return pytest.mark.xfail(xfail, reason=reason) - - -def require_dask_expr(reason="requires dask-expr"): - return pytest.mark.skipif(not QUERY_PLANNING_ON, reason=reason) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index a8cb696d7f6..b88816a3d47 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -39,10 +39,10 @@ classifiers = [ ] [project.entry-points."dask.dataframe.backends"] -cudf = "dask_cudf.backends:CudfBackendEntrypoint" +cudf = "dask_cudf.backends:LegacyCudfBackendEntrypoint" [project.entry-points."dask_expr.dataframe.backends"] -cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" +cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ @@ -102,8 +102,5 @@ filterwarnings = [ # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask", - # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437 - # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False` - "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning", ] xfail_strict = true diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 5f9a04d3cee..259492b98d1 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -34,9 +34,6 @@ endif() unset(cudf_FOUND) -# Find Python early so that later commands can use it -find_package(Python 3.10 REQUIRED COMPONENTS Interpreter) - set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd index 2d070ddda69..fbd478f963f 100644 --- a/python/pylibcudf/pylibcudf/hashing.pxd +++ b/python/pylibcudf/pylibcudf/hashing.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t @@ -16,6 +16,10 @@ cpdef Table murmurhash3_x64_128( uint64_t seed=* ) +cpdef Column xxhash_32( + Table input, + uint32_t seed=* +) cpdef Column xxhash_64( Table input, diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi index a849f5d0729..d535d842a18 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyi +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -9,6 +9,7 @@ LIBCUDF_DEFAULT_HASH_SEED: Final[int] def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_32(input: Table, seed: int = ...) -> Column: ... def xxhash_64(input: Table, seed: int = ...) -> Column: ... def md5(input: Table) -> Column: ... def sha1(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 548cffc0ce8..1f093b20c6b 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -13,6 +13,7 @@ from pylibcudf.libcudf.hash cimport ( sha256 as cpp_sha256, sha384 as cpp_sha384, sha512 as cpp_sha512, + xxhash_32 as cpp_xxhash_32, xxhash_64 as cpp_xxhash_64, ) from pylibcudf.libcudf.table.table cimport table @@ -30,6 +31,7 @@ __all__ = [ "sha256", "sha384", "sha512", + "xxhash_32", "xxhash_64", ] @@ -95,6 +97,37 @@ cpdef Table murmurhash3_x64_128( return Table.from_libcudf(move(c_result)) +cpdef Column xxhash_32( + Table input, + uint32_t seed=DEFAULT_HASH_SEED +): + """Computes the xxHash 32-bit hash value of each row in the given table. + + For details, see :cpp:func:`xxhash_32`. + + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_xxhash_32( + input.view(), + seed + ) + + return Column.from_libcudf(move(c_result)) + + cpdef Column xxhash_64( Table input, uint64_t seed=DEFAULT_HASH_SEED diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd index 4e8a01b41a5..46fdf62cd6b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
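+# Copyright (c) 2020-2025, NVIDIA CORPORATION.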
from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -44,6 +44,11 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const table_view& input ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_32( + const table_view& input, + const uint32_t seed + ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_64( const table_view& input, const uint64_t seed diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 83fb50fa4ef..7096dbe14ff 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import hashlib import struct @@ -34,7 +34,9 @@ def hash_single_uint32(val, seed=0): def hash_combine_32(lhs, rhs): - return np.uint32(lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2))) + return np.uint32( + int((lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)))) % 2**32 + ) def uint_hash_combine_32(lhs, rhs): @@ -80,22 +82,6 @@ def list_struct_table(): return data -def python_hash_value(x, method): - if method == "murmurhash3_x86_32": - return libcudf_mmh3_x86_32(x) - elif method == "murmurhash3_x64_128": - hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) - hasher.update(x) - # libcudf returns a tuple of two 64-bit integers - return hasher.utupledigest() - elif method == "xxhash_64": - return xxhash.xxh64( - x, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED - ).intdigest() - else: - return getattr(hashlib, method)(x).hexdigest() - - @pytest.mark.parametrize( "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"] ) @@ -115,6 +101,23 @@ def py_hasher(val): assert_column_eq(got, expect) +def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl): + def py_hasher(val): + return xxhash.xxh32( + scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ).intdigest() + + expect = pa.array( + [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], + type=pa.uint32(), + ) + got = plc.hashing.xxhash_32( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) + + assert_column_eq(got, expect) + + def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): return xxhash.xxh64( @@ -125,7 +128,9 @@ def py_hasher(val): [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], type=pa.uint64(), ) - got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0) + got = plc.hashing.xxhash_64( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) assert_column_eq(got, expect)
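A minimal usage sketch (not part of the patch) of the new `xxhash_32` binding added above; it assumes a pylibcudf build that includes this change, and it uses the `pyarrow` and `xxhash` packages plus the interop helpers purely for a host-side comparison, mirroring `test_hash_column_xxhash32`.

# Quick host-side check of the new 32-bit xxHash binding (mirrors
# test_hash_column_xxhash32 above). Assumes this patch is installed and that
# the pyarrow and xxhash packages are available.
import pyarrow as pa
import xxhash

import pylibcudf as plc

strings = ["foo", "bar", "baz"]
tbl = plc.Table([plc.interop.from_arrow(pa.array(strings))])

# One uint32 hash value per input row
got = plc.hashing.xxhash_32(tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)

# Reference values computed on the host from each string's UTF-8 bytes
expect = [
    xxhash.xxh32(s.encode(), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED).intdigest()
    for s in strings
]
assert plc.interop.to_arrow(got).to_pylist() == expect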