diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9dabe4e8800..354560998c5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -336,6 +336,7 @@ add_library( src/aggregation/result_cache.cpp src/ast/expression_parser.cpp src/ast/expressions.cpp + src/ast/operators.cpp src/binaryop/binaryop.cpp src/binaryop/compiled/ATan2.cu src/binaryop/compiled/Add.cu @@ -477,13 +478,13 @@ add_library( src/io/avro/reader_impl.cu src/io/comp/brotli_dict.cpp src/io/comp/comp.cpp + src/io/comp/comp.cu src/io/comp/cpu_unbz2.cpp src/io/comp/debrotli.cu src/io/comp/gpuinflate.cu src/io/comp/nvcomp_adapter.cpp src/io/comp/nvcomp_adapter.cu src/io/comp/snap.cu - src/io/comp/statistics.cu src/io/comp/uncomp.cpp src/io/comp/unsnap.cu src/io/csv/csv_gpu.cu @@ -515,6 +516,7 @@ add_library( src/datetime/timezone.cpp src/io/orc/writer_impl.cu src/io/parquet/arrow_schema_writer.cpp + src/io/parquet/bloom_filter_reader.cu src/io/parquet/compact_protocol_reader.cpp src/io/parquet/compact_protocol_writer.cpp src/io/parquet/decode_preprocess.cu diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp index a1b7db5e08a..2b2a660bed7 100644 --- a/cpp/include/cudf/aggregation.hpp +++ b/cpp/include/cudf/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -601,7 +601,7 @@ std::unique_ptr make_udf_aggregation(udf_type type, data_type output_type); // Forward declaration of `host_udf_base` for the factory function of `HOST_UDF` aggregation. -struct host_udf_base; +class host_udf_base; /** * @brief Factory to create a HOST_UDF aggregation. diff --git a/cpp/include/cudf/aggregation/host_udf.hpp b/cpp/include/cudf/aggregation/host_udf.hpp index bbce76dc5f3..451d75137e4 100644 --- a/cpp/include/cudf/aggregation/host_udf.hpp +++ b/cpp/include/cudf/aggregation/host_udf.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,18 +17,16 @@ #pragma once #include +#include #include #include #include -#include #include #include +#include #include -#include -#include -#include /** * @file host_udf.hpp @@ -43,49 +41,141 @@ namespace CUDF_EXPORT cudf { */ /** - * @brief The interface for host-based UDF implementation. + * @brief The fundamental interface for host-based UDF implementation. * - * An implementation of host-based UDF needs to be derived from this base class, defining - * its own version of the required functions. In particular: - * - The derived class is required to implement `get_empty_output`, `operator()`, `is_equal`, - * and `clone` functions. - * - If necessary, the derived class can also override `do_hash` to compute hashing for its - * instance, and `get_required_data` to selectively access to the input data as well as - * intermediate data provided by libcudf. + * This class declares the functions `do_hash`, `is_equal`, and `clone` that must be defined in + * the users' UDF implementation. These functions are required for libcudf aggregation framework + * to perform its operations. + */ +class host_udf_base { + // Declare constructor private to prevent the users from deriving from this class. 
+ private: + host_udf_base() = default; ///< Default constructor + + // Only allow deriving from the structs below. + friend struct reduce_host_udf; + friend struct segmented_reduce_host_udf; + friend struct groupby_host_udf; + + public: + virtual ~host_udf_base() = default; ///< Default destructor + + /** + * @brief Computes hash value of the instance. + * + * Overriding this function is optional when the derived class has data members such that + * each instance needs to be differentiated from each other. + * + * @return The hash value of the instance + */ + [[nodiscard]] virtual std::size_t do_hash() const + { + return std::hash{}(static_cast(aggregation::Kind::HOST_UDF)); + } + + /** + * @brief Compares two instances of the derived class for equality. + * @param other The other instance to compare with + * @return True if the two instances are equal + */ + [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + + /** + * @brief Clones the instance. + * + * The instances of the derived class should be lightweight for efficient cloning. + * + * @return A new instance cloned from this one + */ + [[nodiscard]] virtual std::unique_ptr clone() const = 0; +}; + +/** + * @brief The interface for host-based UDF implementation for reduction contexts. + * + * An implementation of host-based UDF for reduction needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such derived classes must also define the `operator()` function to perform reduction + * operations. * - * Example of such implementation: + * Example: * @code{.cpp} - * struct my_udf_aggregation : cudf::host_udf_base { + * struct my_udf_aggregation : cudf::reduce_host_udf { * my_udf_aggregation() = default; * - * // This UDF aggregation needs `GROUPED_VALUES` and `GROUP_OFFSETS`, - * // and the result from groupby `MAX` aggregation. - * [[nodiscard]] data_attribute_set_t get_required_data() const override + * [[nodiscard]] std::unique_ptr operator()( + * column_view const& input, + * data_type output_dtype, + * std::optional> init, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override * { - * return {groupby_data_attribute::GROUPED_VALUES, - * groupby_data_attribute::GROUP_OFFSETS, - * cudf::make_max_aggregation()}; + * // Perform reduction computation using the input data and return the reduction result. + * // This is where the actual reduction logic is implemented. * } * - * [[nodiscard]] output_t get_empty_output( - * [[maybe_unused]] std::optional output_dtype, - * [[maybe_unused]] rmm::cuda_stream_view stream, - * [[maybe_unused]] rmm::device_async_resource_ref mr) const override + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override * { - * // This UDF aggregation always returns a column of type INT32. - * return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); + * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. + * return dynamic_cast(&other) != nullptr; * } * - * [[nodiscard]] output_t operator()(input_map_t const& input, - * rmm::cuda_stream_view stream, - * rmm::device_async_resource_ref mr) const override + * [[nodiscard]] std::unique_ptr clone() const override * { - * // Perform UDF computation using the input data and return the result. 
+ * return std::make_unique(); + * } + * }; + * @endcode + */ +struct reduce_host_udf : host_udf_base { + /** + * @brief Perform reduction operations. + * + * @param input The input column for reduction + * @param output_dtype The data type for the final output scalar + * @param init The initial value of the reduction + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation + */ + [[nodiscard]] virtual std::unique_ptr operator()( + column_view const& input, + data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; +}; + +/** + * @brief The interface for host-based UDF implementation for segmented reduction context. + * + * An implementation of host-based UDF for segmented reduction needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such derived class must also define the `operator()` function to perform segmented reduction. + * + * Example: + * @code{.cpp} + * struct my_udf_aggregation : cudf::segmented_reduce_host_udf { + * my_udf_aggregation() = default; + * + * [[nodiscard]] std::unique_ptr operator()( + * column_view const& input, + * device_span offsets, + * data_type output_dtype, + * null_policy null_handling, + * std::optional> init, + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform computation using the input data and return the result. + * // This is where the actual segmented reduction logic is implemented. * } * * [[nodiscard]] bool is_equal(host_udf_base const& other) const override * { * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. * return dynamic_cast(&other) != nullptr; * } * @@ -96,198 +186,232 @@ namespace CUDF_EXPORT cudf { * }; * @endcode */ -struct host_udf_base { - host_udf_base() = default; - virtual ~host_udf_base() = default; - +struct segmented_reduce_host_udf : host_udf_base { /** - * @brief Define the possible data needed for groupby aggregations. + * @brief Perform segmented reduction operations. * - * Note that only sort-based groupby aggregations are supported. + * @param input The input column for reduction + * @param offsets A list of offsets defining the segments for reduction + * @param output_dtype The data type for the final output column + * @param null_handling If `INCLUDE` then the reduction result is valid only if all elements in + * the segment are valid, and if `EXCLUDE` then the reduction result is valid if any + * element in the segment is valid + * @param init The initial value of the reduction + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation */ - enum class groupby_data_attribute : int32_t { - INPUT_VALUES, ///< The input values column. - GROUPED_VALUES, ///< The input values grouped according to the input `keys` for which the - ///< values within each group maintain their original order. - SORTED_GROUPED_VALUES, ///< The input values grouped according to the input `keys` and - ///< sorted within each group. - NUM_GROUPS, ///< The number of groups (i.e., number of distinct keys). - GROUP_OFFSETS, ///< The offsets separating groups. 
- GROUP_LABELS ///< Group labels (which is also the same as group indices). - }; + [[nodiscard]] virtual std::unique_ptr operator()( + column_view const& input, + device_span offsets, + data_type output_dtype, + null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const = 0; +}; +// Forward declaration. +namespace groupby ::detail { +struct aggregate_result_functor; +} + +/** + * @brief The interface for host-based UDF implementation for groupby aggregation context. + * + * An implementation of host-based UDF for groupby needs to be derived from this class. + * In addition to implementing the virtual functions declared in the base class `host_udf_base`, + * such a derived class must also define the functions `get_empty_output()` to return result when + * the input is empty, and ``operator()`` to perform its groupby operations. + * + * During execution, the derived class can access internal data provided by the libcudf groupby + * framework through a set of ``get*`` accessors, as well as calling other built-in groupby + * aggregations through the ``compute_aggregation`` function. + * + * @note The derived class can only perform sort-based groupby aggregations. Hash-based groupby + * aggregations require more complex data structure and is not yet supported. + * + * Example: + * @code{.cpp} + * struct my_udf_aggregation : cudf::groupby_host_udf { + * my_udf_aggregation() = default; + * + * [[nodiscard]] std::unique_ptr get_empty_output( + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Return a column corresponding to the result when the input values column is empty. + * } + * + * [[nodiscard]] std::unique_ptr operator()( + * rmm::cuda_stream_view stream, + * rmm::device_async_resource_ref mr) const override + * { + * // Perform UDF computation using the input data and return the result. + * } + * + * [[nodiscard]] bool is_equal(host_udf_base const& other) const override + * { + * // Check if the other object is also instance of this class. + * // If there are internal state variables, they may need to be checked for equality as well. + * return dynamic_cast(&other) != nullptr; + * } + * + * [[nodiscard]] std::unique_ptr clone() const override + * { + * return std::make_unique(); + * } + * }; + * @endcode + */ +struct groupby_host_udf : host_udf_base { /** - * @brief Describe possible data that may be needed in the derived class for its operations. + * @brief Get the output when the input values column is empty. * - * Such data can be either intermediate data such as sorted values or group labels etc, or the - * results of other aggregations. + * This is called in libcudf when the input values column is empty. In such situations libcudf + * tries to generate the output directly without unnecessarily evaluating the intermediate data. * - * Each derived host-based UDF class may need a different set of data. It is inefficient to - * evaluate and pass down all these possible data at once from libcudf. A solution for that is, - * the derived class can define a subset of data that it needs and libcudf will evaluate - * and pass down only data requested from that set. 
+ * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation when the input values column is empty */ - struct data_attribute { - /** - * @brief Hold all possible data types for the input of the aggregation in the derived class. - */ - using value_type = std::variant>; - value_type value; ///< The actual data attribute, wrapped by this struct - ///< as a wrapper is needed to define `hash` and `equal_to` functors. - - data_attribute() = default; ///< Default constructor - data_attribute(data_attribute&&) = default; ///< Move constructor - - /** - * @brief Construct a new data attribute from an aggregation attribute. - * @param value_ An aggregation attribute - */ - template )> - data_attribute(T value_) : value{value_} - { - } - - /** - * @brief Construct a new data attribute from another aggregation request. - * @param value_ An aggregation request - */ - template || - std::is_same_v)> - data_attribute(std::unique_ptr value_) : value{std::move(value_)} - { - CUDF_EXPECTS(std::get>(value) != nullptr, - "Invalid aggregation request."); - if constexpr (std::is_same_v) { - CUDF_EXPECTS( - dynamic_cast(std::get>(value).get()) != nullptr, - "Requesting results from other aggregations is only supported in groupby " - "aggregations."); - } - } - - /** - * @brief Copy constructor. - * @param other The other data attribute to copy from - */ - data_attribute(data_attribute const& other); - - /** - * @brief Hash functor for `data_attribute`. - */ - struct hash { - /** - * @brief Compute the hash value of a data attribute. - * @param attr The data attribute to hash - * @return The hash value of the data attribute - */ - std::size_t operator()(data_attribute const& attr) const; - }; // struct hash - - /** - * @brief Equality comparison functor for `data_attribute`. - */ - struct equal_to { - /** - * @brief Check if two data attributes are equal. - * @param lhs The left-hand side data attribute - * @param rhs The right-hand side data attribute - * @return True if the two data attributes are equal - */ - bool operator()(data_attribute const& lhs, data_attribute const& rhs) const; - }; // struct equal_to - }; // struct data_attribute + [[nodiscard]] virtual std::unique_ptr get_empty_output( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0; /** - * @brief Set of attributes for the input data that is needed for computing the aggregation. + * @brief Perform the main groupby computation for the host-based UDF. + * + * @param stream The CUDA stream to use for any kernel launches + * @param mr Device memory resource to use for any allocations + * @return The output result of the aggregation */ - using data_attribute_set_t = - std::unordered_set; + [[nodiscard]] virtual std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const = 0; + + private: + // Allow the struct `aggregate_result_functor` to set its private callback variables. + friend struct groupby::detail::aggregate_result_functor; /** - * @brief Return a set of attributes for the data that is needed for computing the aggregation. - * - * The derived class should return the attributes corresponding to only the data that it needs to - * avoid unnecessary computation performed in libcudf. If this function is not overridden, an - * empty set is returned. That means all the data attributes (except results from other - * aggregations in groupby) will be needed. 
- * - * @return A set of `data_attribute` + * @brief Callback to access the input values column. + */ + std::function callback_input_values; + + /** + * @brief Callback to access the input values grouped according to the input keys for which the + * values within each group maintain their original order. + */ + std::function callback_grouped_values; + + /** + * @brief Callback to access the input values grouped according to the input keys and sorted + * within each group. + */ + std::function callback_sorted_grouped_values; + + /** + * @brief Callback to access the number of groups (i.e., number of distinct keys). */ - [[nodiscard]] virtual data_attribute_set_t get_required_data() const { return {}; } + std::function callback_num_groups; /** - * @brief Hold all possible types of the data that is passed to the derived class for executing - * the aggregation. + * @brief Callback to access the offsets separating groups. */ - using input_data_t = std::variant>; + std::function(void)> callback_group_offsets; /** - * @brief Input to the aggregation, mapping from each data attribute to its actual data. + * @brief Callback to access the group labels (which is also the same as group indices). */ - using input_map_t = std:: - unordered_map; + std::function(void)> callback_group_labels; /** - * @brief Output type of the aggregation. + * @brief Callback to access the result from other groupby aggregations. + */ + std::function)> callback_compute_aggregation; + + protected: + /** + * @brief Access the input values column. * - * Currently only a single type is supported as the output of the aggregation, but it will hold - * more type in the future when reduction is supported. + * @return The input values column. */ - using output_t = std::variant>; + [[nodiscard]] column_view get_input_values() const + { + CUDF_EXPECTS(callback_input_values, "Uninitialized callback_input_values."); + return callback_input_values(); + } /** - * @brief Get the output when the input values column is empty. + * @brief Access the input values grouped according to the input keys for which the values + * within each group maintain their original order. * - * This is called in libcudf when the input values column is empty. In such situations libcudf - * tries to generate the output directly without unnecessarily evaluating the intermediate data. + * @return The grouped values column. + */ + [[nodiscard]] column_view get_grouped_values() const + { + CUDF_EXPECTS(callback_grouped_values, "Uninitialized callback_grouped_values."); + return callback_grouped_values(); + } + + /** + * @brief Access the input values grouped according to the input keys and sorted within each + * group. * - * @param output_dtype The expected output data type - * @param stream The CUDA stream to use for any kernel launches - * @param mr Device memory resource to use for any allocations - * @return The output result of the aggregation when input values is empty + * @return The sorted grouped values column. */ - [[nodiscard]] virtual output_t get_empty_output(std::optional output_dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; + [[nodiscard]] column_view get_sorted_grouped_values() const + { + CUDF_EXPECTS(callback_sorted_grouped_values, "Uninitialized callback_sorted_grouped_values."); + return callback_sorted_grouped_values(); + } /** - * @brief Perform the main computation for the host-based UDF. + * @brief Access the number of groups (i.e., number of distinct keys). 
* - * @param input The input data needed for performing all computation - * @param stream The CUDA stream to use for any kernel launches - * @param mr Device memory resource to use for any allocations - * @return The output result of the aggregation + * @return The number of groups. */ - [[nodiscard]] virtual output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; + [[nodiscard]] size_type get_num_groups() const + { + CUDF_EXPECTS(callback_num_groups, "Uninitialized callback_num_groups."); + return callback_num_groups(); + } /** - * @brief Computes hash value of the class's instance. - * @return The hash value of the instance + * @brief Access the offsets separating groups. + * + * @return The array of group offsets. */ - [[nodiscard]] virtual std::size_t do_hash() const + [[nodiscard]] device_span get_group_offsets() const { - return std::hash{}(static_cast(aggregation::Kind::HOST_UDF)); + CUDF_EXPECTS(callback_group_offsets, "Uninitialized callback_group_offsets."); + return callback_group_offsets(); } /** - * @brief Compares two instances of the derived class for equality. - * @param other The other derived class's instance to compare with - * @return True if the two instances are equal + * @brief Access the group labels (which is also the same as group indices). + * + * @return The array of group labels. */ - [[nodiscard]] virtual bool is_equal(host_udf_base const& other) const = 0; + [[nodiscard]] device_span get_group_labels() const + { + CUDF_EXPECTS(callback_group_labels, "Uninitialized callback_group_labels."); + return callback_group_labels(); + } /** - * @brief Clones the instance. + * @brief Compute a built-in groupby aggregation and access its result. * - * A class derived from `host_udf_base` should not store too much data such that its instances - * remain lightweight for efficient cloning. + * This allows the derived class to call any other built-in groupby aggregations on the same input + * values column and access the output for its operations. * - * @return A new instance cloned from this + * @param other_agg An arbitrary built-in groupby aggregation + * @return A `column_view` object corresponding to the output result of the given aggregation */ - [[nodiscard]] virtual std::unique_ptr clone() const = 0; + [[nodiscard]] column_view compute_aggregation(std::unique_ptr other_agg) const + { + CUDF_EXPECTS(callback_compute_aggregation, "Uninitialized callback for computing aggregation."); + return callback_compute_aggregation(std::move(other_agg)); + } }; /** @} */ // end of group diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index 9d8762555d7..001b604814c 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
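The host_udf.hpp changes above replace the old data_attribute/input_map machinery with three concrete interfaces (reduce_host_udf, segmented_reduce_host_udf, groupby_host_udf) whose groupby variant exposes protected accessors and a compute_aggregation() hook. A minimal sketch of a derived groupby UDF using that interface follows; it is illustrative only and not part of this patch, the class name group_max_udf and the INT32 empty-output type are assumptions, and the usual cudf/rmm headers are presumed available.

#include <cudf/aggregation.hpp>
#include <cudf/aggregation/host_udf.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/resource_ref.hpp>

#include <memory>

// Illustrative only: a groupby host UDF that simply forwards to the built-in MAX
// aggregation through the compute_aggregation() hook shown above.
struct group_max_udf : cudf::groupby_host_udf {
  [[nodiscard]] std::unique_ptr<cudf::column> get_empty_output(
    [[maybe_unused]] rmm::cuda_stream_view stream,
    [[maybe_unused]] rmm::device_async_resource_ref mr) const override
  {
    // Placeholder result type for the empty-input case (an assumption for this sketch).
    return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32});
  }

  [[nodiscard]] std::unique_ptr<cudf::column> operator()(
    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override
  {
    // Run the built-in MAX aggregation on the same grouped values and copy out its result.
    auto const max_result =
      compute_aggregation(cudf::make_max_aggregation<cudf::groupby_aggregation>());
    return std::make_unique<cudf::column>(max_result, stream, mr);
  }

  [[nodiscard]] bool is_equal(cudf::host_udf_base const& other) const override
  {
    return dynamic_cast<group_max_udf const*>(&other) != nullptr;
  }

  [[nodiscard]] std::unique_ptr<cudf::host_udf_base> clone() const override
  {
    return std::make_unique<group_max_udf>();
  }
};

Such an instance would typically be wrapped through the make_host_udf_aggregation factory declared in cudf/aggregation.hpp, e.g. cudf::make_host_udf_aggregation<cudf::groupby_aggregation>(std::make_unique<group_max_udf>()), and then passed to groupby like any other aggregation request.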
@@ -452,7 +452,7 @@ struct expression_evaluator { ++operator_index) { // Execute operator auto const op = plan.operators[operator_index]; - auto const arity = ast_operator_arity(op); + auto const arity = plan.operator_arities[operator_index]; if (arity == 1) { // Unary operator auto const& input = diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp index b5973d0ace9..d2e8c1cd41f 100644 --- a/cpp/include/cudf/ast/detail/expression_parser.hpp +++ b/cpp/include/cudf/ast/detail/expression_parser.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -88,6 +89,7 @@ struct expression_device_view { device_span data_references; device_span literals; device_span operators; + device_span operator_arities; device_span operator_source_indices; cudf::size_type num_intermediates; }; @@ -229,39 +231,55 @@ class expression_parser { * @param[in] v The `std::vector` containing components (operators, literals, etc). * @param[in,out] sizes The `std::vector` containing the size of each data buffer. * @param[in,out] data_pointers The `std::vector` containing pointers to each data buffer. + * @param[in,out] alignment The maximum alignment needed for all the extracted size and pointers */ template void extract_size_and_pointer(std::vector const& v, std::vector& sizes, - std::vector& data_pointers) + std::vector& data_pointers, + cudf::size_type& alignment) { + // sub-type alignment will only work provided the alignment is lesser or equal to + // alignof(max_align_t) which is the maximum alignment provided by rmm's device buffers + static_assert(alignof(T) <= alignof(max_align_t)); auto const data_size = sizeof(T) * v.size(); sizes.push_back(data_size); data_pointers.push_back(v.data()); + alignment = std::max(alignment, static_cast(alignof(T))); } void move_to_device(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { std::vector sizes; std::vector data_pointers; + // use a minimum of 4-byte alignment + cudf::size_type buffer_alignment = 4; - extract_size_and_pointer(_data_references, sizes, data_pointers); - extract_size_and_pointer(_literals, sizes, data_pointers); - extract_size_and_pointer(_operators, sizes, data_pointers); - extract_size_and_pointer(_operator_source_indices, sizes, data_pointers); + extract_size_and_pointer(_data_references, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_literals, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operators, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operator_arities, sizes, data_pointers, buffer_alignment); + extract_size_and_pointer(_operator_source_indices, sizes, data_pointers, buffer_alignment); // Create device buffer - auto const buffer_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); - auto buffer_offsets = std::vector(sizes.size()); - thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), buffer_offsets.begin(), 0); + auto buffer_offsets = std::vector(sizes.size()); + thrust::exclusive_scan(sizes.cbegin(), + sizes.cend(), + buffer_offsets.begin(), + cudf::size_type{0}, + [buffer_alignment](auto a, auto b) { + // align each component of the AST program + return cudf::util::round_up_safe(a + b, buffer_alignment); + }); + + auto 
const buffer_size = buffer_offsets.empty() ? 0 : (buffer_offsets.back() + sizes.back()); + auto host_data_buffer = std::vector(buffer_size); - auto h_data_buffer = std::vector(buffer_size); for (unsigned int i = 0; i < data_pointers.size(); ++i) { - std::memcpy(h_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); + std::memcpy(host_data_buffer.data() + buffer_offsets[i], data_pointers[i], sizes[i]); } - _device_data_buffer = rmm::device_buffer(h_data_buffer.data(), buffer_size, stream, mr); - + _device_data_buffer = rmm::device_buffer(host_data_buffer.data(), buffer_size, stream, mr); stream.synchronize(); // Create device pointers to components of plan @@ -277,8 +295,11 @@ class expression_parser { device_expression_data.operators = device_span( reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), _operators.size()); - device_expression_data.operator_source_indices = device_span( + device_expression_data.operator_arities = device_span( reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + _operators.size()); + device_expression_data.operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[4]), _operator_source_indices.size()); device_expression_data.num_intermediates = _intermediate_counter.get_max_used(); shmem_per_thread = static_cast( @@ -322,6 +343,7 @@ class expression_parser { bool _has_nulls; std::vector _data_references; std::vector _operators; + std::vector _operator_arities; std::vector _operator_source_indices; std::vector _literals; }; diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 46507700e21..db04e1fe989 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -69,159 +69,111 @@ constexpr bool is_valid_unary_op = cuda::std::is_invocable_v; * @param args Forwarded arguments to `operator()` of `f`. */ template -CUDF_HOST_DEVICE inline constexpr void ast_operator_dispatcher(ast_operator op, F&& f, Ts&&... args) +CUDF_HOST_DEVICE inline constexpr decltype(auto) ast_operator_dispatcher(ast_operator op, + F&& f, + Ts&&... 
args) { switch (op) { case ast_operator::ADD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SUB: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::MUL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TRUE_DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::FLOOR_DIV: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::MOD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::PYMOD: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::POW: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NOT_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LESS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::GREATER: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LESS_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::GREATER_EQUAL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BITWISE_XOR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOGICAL_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_LOGICAL_AND: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOGICAL_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NULL_LOGICAL_OR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::IDENTITY: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::IS_NULL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case 
ast_operator::SIN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::COS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TAN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCSIN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCCOS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCTAN: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SINH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::COSH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::TANH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCSINH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCCOSH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ARCTANH: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::EXP: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::LOG: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::SQRT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CBRT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CEIL: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::FLOOR: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::ABS: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::RINT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::BIT_INVERT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::NOT: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_INT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_UINT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); case ast_operator::CAST_TO_FLOAT64: - f.template operator()(std::forward(args)...); - break; + return f.template operator()(std::forward(args)...); default: { #ifndef __CUDA_ARCH__ CUDF_FAIL("Invalid operator."); @@ -955,231 +907,6 @@ 
struct operator_functor { } }; -/** - * @brief Functor used to single-type-dispatch binary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_binary_op` trait. This function assumes that both inputs are - * the same type, and dispatches based on the type of the left input. - * - * @tparam OperatorFunctor Binary operator functor. - */ -template -struct single_dispatch_binary_operator_types { - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation."); -#else - CUDF_UNREACHABLE("Invalid binary operation."); -#endif - } -}; - -/** - * @brief Functor performing a type dispatch for a binary operator. - * - * This functor performs single dispatch, which assumes lhs_type == rhs_type. This may not be true - * for all binary operators but holds for all currently implemented operators. - */ -struct type_dispatch_binary_op { - /** - * @brief Performs type dispatch for a binary operator. - * - * @tparam op AST operator. - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param lhs_type Type of left input data. - * @param rhs_type Type of right input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ - template - CUDF_HOST_DEVICE inline void operator()(cudf::data_type lhs_type, - cudf::data_type rhs_type, - F&& f, - Ts&&... args) - { - // Single dispatch (assume lhs_type == rhs_type) - type_dispatcher( - lhs_type, - // Always dispatch to the non-null operator for the purpose of type determination. - detail::single_dispatch_binary_operator_types>{}, - std::forward(f), - std::forward(args)...); - } -}; - -/** - * @brief Dispatches a runtime binary operator to a templated type dispatcher. - * - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param lhs_type Type of left input data. - * @param rhs_type Type of right input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ -template -CUDF_HOST_DEVICE inline constexpr void binary_operator_dispatcher( - ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) -{ - ast_operator_dispatcher(op, - detail::type_dispatch_binary_op{}, - lhs_type, - rhs_type, - std::forward(f), - std::forward(args)...); -} - -/** - * @brief Functor used to type-dispatch unary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_unary_op` trait. - * - * @tparam OperatorFunctor Unary operator functor. - */ -template -struct dispatch_unary_operator_types { - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid unary operation."); -#else - CUDF_UNREACHABLE("Invalid unary operation."); -#endif - } -}; - -/** - * @brief Functor performing a type dispatch for a unary operator. 
- */ -struct type_dispatch_unary_op { - template - CUDF_HOST_DEVICE inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args) - { - type_dispatcher( - input_type, - // Always dispatch to the non-null operator for the purpose of type determination. - detail::dispatch_unary_operator_types>{}, - std::forward(f), - std::forward(args)...); - } -}; - -/** - * @brief Dispatches a runtime unary operator to a templated type dispatcher. - * - * @tparam F Type of forwarded functor. - * @tparam Ts Parameter pack of forwarded arguments. - * @param input_type Type of input data. - * @param f Forwarded functor to be called. - * @param args Forwarded arguments to `operator()` of `f`. - */ -template -CUDF_HOST_DEVICE inline constexpr void unary_operator_dispatcher(ast_operator op, - cudf::data_type input_type, - F&& f, - Ts&&... args) -{ - ast_operator_dispatcher(op, - detail::type_dispatch_unary_op{}, - input_type, - std::forward(f), - std::forward(args)...); -} - -/** - * @brief Functor to determine the return type of an operator from its input types. - */ -struct return_type_functor { - /** - * @brief Callable for binary operators to determine return type. - * - * @tparam OperatorFunctor Operator functor to perform. - * @tparam LHS Left input type. - * @tparam RHS Right input type. - * @param result Reference whose value is assigned to the result data type. - */ - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { - using Out = cuda::std::invoke_result_t; - result = cudf::data_type(cudf::type_to_id()); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation. Return type cannot be determined."); -#else - CUDF_UNREACHABLE("Invalid binary operation. Return type cannot be determined."); -#endif - } - - /** - * @brief Callable for unary operators to determine return type. - * - * @tparam OperatorFunctor Operator functor to perform. - * @tparam T Input type. - * @param result Pointer whose value is assigned to the result data type. - */ - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { - using Out = cuda::std::invoke_result_t; - result = cudf::data_type(cudf::type_to_id()); - } - - template >* = nullptr> - CUDF_HOST_DEVICE inline void operator()(cudf::data_type& result) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); -#else - CUDF_UNREACHABLE("Invalid unary operation. Return type cannot be determined."); -#endif - } -}; - /** * @brief Gets the return type of an AST operator. * @@ -1187,34 +914,8 @@ struct return_type_functor { * @param operand_types Vector of input types to the operator. * @return cudf::data_type Return type of the operator. */ -inline cudf::data_type ast_operator_return_type(ast_operator op, - std::vector const& operand_types) -{ - auto result = cudf::data_type(cudf::type_id::EMPTY); - switch (operand_types.size()) { - case 1: - unary_operator_dispatcher(op, operand_types[0], detail::return_type_functor{}, result); - break; - case 2: - binary_operator_dispatcher( - op, operand_types[0], operand_types[1], detail::return_type_functor{}, result); - break; - default: CUDF_FAIL("Unsupported operator return type."); break; - } - return result; -} - -/** - * @brief Functor to determine the arity (number of operands) of an operator. 
- */ -struct arity_functor { - template - CUDF_HOST_DEVICE inline void operator()(cudf::size_type& result) - { - // Arity is not dependent on null handling, so just use the false implementation here. - result = operator_functor::arity; - } -}; +cudf::data_type ast_operator_return_type(ast_operator op, + std::vector const& operand_types); /** * @brief Gets the arity (number of operands) of an AST operator. @@ -1222,12 +923,7 @@ struct arity_functor { * @param op Operator used to determine arity. * @return Arity of the operator. */ -CUDF_HOST_DEVICE inline cudf::size_type ast_operator_arity(ast_operator op) -{ - auto result = cudf::size_type(0); - ast_operator_dispatcher(op, detail::arity_functor{}, result); - return result; -} +cudf::size_type ast_operator_arity(ast_operator op); } // namespace detail diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp index d873e93bd20..5574ed6ea6e 100644 --- a/cpp/include/cudf/detail/aggregation/aggregation.hpp +++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -967,7 +967,9 @@ class udf_aggregation final : public rolling_aggregation { /** * @brief Derived class for specifying host-based UDF aggregation. */ -class host_udf_aggregation final : public groupby_aggregation { +class host_udf_aggregation final : public groupby_aggregation, + public reduce_aggregation, + public segmented_reduce_aggregation { public: std::unique_ptr udf_ptr; diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp index 163fa20806d..82f7761da2e 100644 --- a/cpp/include/cudf/io/orc.hpp +++ b/cpp/include/cudf/io/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -578,7 +578,7 @@ class orc_writer_options { // Specify the sink to use for writer output sink_info _sink; // Specify the compression format to use - compression_type _compression = compression_type::AUTO; + compression_type _compression = compression_type::SNAPPY; // Specify frequency of statistics collection statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) @@ -733,7 +733,11 @@ class orc_writer_options { * * @param comp Compression type */ - void set_compression(compression_type comp) { _compression = comp; } + void set_compression(compression_type comp) + { + _compression = comp; + if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; } + } /** * @brief Choose granularity of statistics collection. 
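The two orc.hpp compression hunks above change how the ORC writer treats AUTO: set_compression() now resolves compression_type::AUTO to SNAPPY, and the builder's compression() routes through that setter so both paths agree. A small usage sketch under those assumptions (the output file name and the table argument are placeholders, not from this patch):

#include <cudf/io/orc.hpp>
#include <cudf/table/table_view.hpp>

// Illustrative only: with this change, requesting AUTO compression on the ORC writer
// stores SNAPPY, whether set through the builder or through set_compression().
void write_orc_with_auto_compression(cudf::table_view const& tbl)
{
  auto opts = cudf::io::orc_writer_options::builder(cudf::io::sink_info{"out.orc"}, tbl)
                .compression(cudf::io::compression_type::AUTO)  // resolved to SNAPPY
                .build();
  // opts.get_compression() now reports compression_type::SNAPPY rather than AUTO.
  cudf::io::write_orc(opts);
}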
@@ -865,7 +869,7 @@ class orc_writer_options_builder { */ orc_writer_options_builder& compression(compression_type comp) { - options._compression = comp; + options.set_compression(comp); return *this; } @@ -1026,7 +1030,7 @@ class chunked_orc_writer_options { // Specify the sink to use for writer output sink_info _sink; // Specify the compression format to use - compression_type _compression = compression_type::AUTO; + compression_type _compression = compression_type::SNAPPY; // Specify granularity of statistics collection statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP; // Maximum size of each stripe (unless smaller than a single row group) @@ -1157,7 +1161,11 @@ class chunked_orc_writer_options { * * @param comp The compression type to use */ - void set_compression(compression_type comp) { _compression = comp; } + void set_compression(compression_type comp) + { + _compression = comp; + if (comp == compression_type::AUTO) { _compression = compression_type::SNAPPY; } + } /** * @brief Choose granularity of statistics collection @@ -1279,7 +1287,7 @@ class chunked_orc_writer_options_builder { */ chunked_orc_writer_options_builder& compression(compression_type comp) { - options._compression = comp; + options.set_compression(comp); return *this; } diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp deleted file mode 100644 index ae48fed4e79..00000000000 --- a/cpp/include/nvtext/detail/generate_ngrams.hpp +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include - -#include - -#include - -namespace CUDF_EXPORT nvtext { -namespace detail { - -/** - * @copydoc hash_character_ngrams(cudf::strings_column_view const&, - * cudf::size_type, rmm::device_async_resource_ref) - * - * @param stream CUDA stream used for allocating/copying device memory and launching kernels - */ -std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, - cudf::size_type ngrams, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace detail -} // namespace CUDF_EXPORT nvtext diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp index 54282b8ef3c..b2ba1798a8f 100644 --- a/cpp/include/nvtext/generate_ngrams.hpp +++ b/cpp/include/nvtext/generate_ngrams.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -117,6 +117,7 @@ std::unique_ptr generate_character_ngrams( * * @param input Strings column to produce ngrams from * @param ngrams The ngram number to generate. Default is 5. + * @param seed The seed value to use with the hash algorithm. Default is 0. 
* @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return A lists column of hash values @@ -124,6 +125,7 @@ std::unique_ptr generate_character_ngrams( std::unique_ptr hash_character_ngrams( cudf::strings_column_view const& input, cudf::size_type ngrams = 5, + uint32_t seed = 0, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/src/ast/expression_parser.cpp b/cpp/src/ast/expression_parser.cpp index d0e4c59ca54..b2cc134d9fa 100644 --- a/cpp/src/ast/expression_parser.cpp +++ b/cpp/src/ast/expression_parser.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -161,6 +161,7 @@ cudf::size_type expression_parser::visit(operation const& expr) auto const op = expr.get_operator(); auto const data_type = cudf::ast::detail::ast_operator_return_type(op, operand_types); _operators.push_back(op); + _operator_arities.push_back(cudf::ast::detail::ast_operator_arity(op)); // Push data reference auto const output = [&]() { if (expression_index == 0) { diff --git a/cpp/src/ast/operators.cpp b/cpp/src/ast/operators.cpp new file mode 100644 index 00000000000..b60a69a42d9 --- /dev/null +++ b/cpp/src/ast/operators.cpp @@ -0,0 +1,293 @@ +/* + * Copyright (c) 2021-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include +#include + +#include +#include + +#include + +namespace cudf { +namespace ast { +namespace detail { +namespace { + +struct arity_functor { + template + void operator()(cudf::size_type& result) + { + // Arity is not dependent on null handling, so just use the false implementation here. + result = operator_functor::arity; + } +}; + +/** + * @brief Functor to determine the return type of an operator from its input types. + */ +struct return_type_functor { + /** + * @brief Callable for binary operators to determine return type. + * + * @tparam OperatorFunctor Operator functor to perform. + * @tparam LHS Left input type. + * @tparam RHS Right input type. + * @param result Pointer whose value is assigned to the result data type. + */ + template >* = nullptr> + void operator()(cudf::data_type& result) + { + using Out = cuda::std::invoke_result_t; + result = cudf::data_type{cudf::type_to_id()}; + } + + template >* = nullptr> + void operator()(cudf::data_type& result) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid binary operation. Return type cannot be determined."); +#else + CUDF_UNREACHABLE("Invalid binary operation. Return type cannot be determined."); +#endif + result = cudf::data_type{cudf::type_id::EMPTY}; + } + + /** + * @brief Callable for unary operators to determine return type. 
+ * + * @tparam OperatorFunctor Operator functor to perform. + * @tparam T Input type. + * @param result Pointer whose value is assigned to the result data type. + */ + template >* = nullptr> + void operator()(cudf::data_type& result) + { + using Out = cuda::std::invoke_result_t; + result = cudf::data_type{cudf::type_to_id()}; + } + + template >* = nullptr> + void operator()(cudf::data_type& result) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid unary operation. Return type cannot be determined."); +#else + CUDF_UNREACHABLE("Invalid unary operation. Return type cannot be determined."); +#endif + result = cudf::data_type{cudf::type_id::EMPTY}; + } +}; + +/** + * @brief Functor used to single-type-dispatch binary operators. + * + * This functor's `operator()` is templated to validate calls to its operators based on the input + * type, as determined by the `is_valid_binary_op` trait. This function assumes that both inputs are + * the same type, and dispatches based on the type of the left input. + * + * @tparam OperatorFunctor Binary operator functor. + */ +template +struct single_dispatch_binary_operator_types { + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { + f.template operator()(std::forward(args)...); + } + + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid binary operation."); +#else + CUDF_UNREACHABLE("Invalid binary operation."); +#endif + } +}; + +/** + * @brief Functor performing a type dispatch for a binary operator. + * + * This functor performs single dispatch, which assumes lhs_type == rhs_type. This may not be true + * for all binary operators but holds for all currently implemented operators. + */ +struct type_dispatch_binary_op { + /** + * @brief Performs type dispatch for a binary operator. + * + * @tparam op AST operator. + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param lhs_type Type of left input data. + * @param rhs_type Type of right input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ + template + inline void operator()(cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) + { + // Single dispatch (assume lhs_type == rhs_type) + type_dispatcher( + lhs_type, + // Always dispatch to the non-null operator for the purpose of type determination. + detail::single_dispatch_binary_operator_types>{}, + std::forward(f), + std::forward(args)...); + } +}; + +/** + * @brief Dispatches a runtime binary operator to a templated type dispatcher. + * + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param lhs_type Type of left input data. + * @param rhs_type Type of right input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ +template +inline constexpr void binary_operator_dispatcher( + ast_operator op, cudf::data_type lhs_type, cudf::data_type rhs_type, F&& f, Ts&&... args) +{ + ast_operator_dispatcher(op, + detail::type_dispatch_binary_op{}, + lhs_type, + rhs_type, + std::forward(f), + std::forward(args)...); +} + +/** + * @brief Functor used to type-dispatch unary operators. + * + * This functor's `operator()` is templated to validate calls to its operators based on the input + * type, as determined by the `is_valid_unary_op` trait. + * + * @tparam OperatorFunctor Unary operator functor. 
+ */ +template +struct dispatch_unary_operator_types { + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { + f.template operator()(std::forward(args)...); + } + + template >* = nullptr> + inline void operator()(F&& f, Ts&&... args) + { +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Invalid unary operation."); +#else + CUDF_UNREACHABLE("Invalid unary operation."); +#endif + } +}; + +/** + * @brief Functor performing a type dispatch for a unary operator. + */ +struct type_dispatch_unary_op { + template + inline void operator()(cudf::data_type input_type, F&& f, Ts&&... args) + { + type_dispatcher( + input_type, + // Always dispatch to the non-null operator for the purpose of type determination. + detail::dispatch_unary_operator_types>{}, + std::forward(f), + std::forward(args)...); + } +}; + +/** + * @brief Dispatches a runtime unary operator to a templated type dispatcher. + * + * @tparam F Type of forwarded functor. + * @tparam Ts Parameter pack of forwarded arguments. + * @param input_type Type of input data. + * @param f Forwarded functor to be called. + * @param args Forwarded arguments to `operator()` of `f`. + */ +template +inline constexpr void unary_operator_dispatcher(ast_operator op, + cudf::data_type input_type, + F&& f, + Ts&&... args) +{ + ast_operator_dispatcher(op, + detail::type_dispatch_unary_op{}, + input_type, + std::forward(f), + std::forward(args)...); +} + +} // namespace + +cudf::data_type ast_operator_return_type(ast_operator op, + std::vector const& operand_types) +{ + cudf::data_type result{cudf::type_id::EMPTY}; + switch (operand_types.size()) { + case 1: + unary_operator_dispatcher(op, operand_types[0], detail::return_type_functor{}, result); + break; + case 2: + binary_operator_dispatcher( + op, operand_types[0], operand_types[1], detail::return_type_functor{}, result); + break; + default: CUDF_FAIL("Unsupported operator return type."); break; + } + return result; +} + +cudf::size_type ast_operator_arity(ast_operator op) +{ + cudf::size_type result{}; + ast_operator_dispatcher(op, arity_functor{}, result); + return result; +} + +} // namespace detail + +} // namespace ast + +} // namespace cudf diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu index 4c90cd0eef5..6234148e9fa 100644 --- a/cpp/src/groupby/groupby.cu +++ b/cpp/src/groupby/groupby.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,8 +145,11 @@ struct empty_column_constructor { } if constexpr (k == aggregation::Kind::HOST_UDF) { - auto const& udf_ptr = dynamic_cast(agg).udf_ptr; - return std::get>(udf_ptr->get_empty_output(std::nullopt, stream, mr)); + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation."); + return udf_ptr->get_empty_output(stream, mr); } return make_empty_column(target_type(values.type(), k)); diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp index 6480070e85a..fb3f7559d64 100644 --- a/cpp/src/groupby/sort/aggregate.cpp +++ b/cpp/src/groupby/sort/aggregate.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. 
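// ---------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] A minimal illustration of the two
// host-side helpers added in src/ast/operators.cpp above. The include path is
// an assumption (the declarations are expected to live in
// cudf/ast/detail/operators.hpp); the calls themselves use only the signatures
// shown in this diff.
#include <cudf/ast/detail/operators.hpp>
#include <cudf/types.hpp>

#include <vector>

void ast_operator_helpers_example()
{
  using namespace cudf::ast;

  // ADD is a binary operator, so the expression parser records an arity of 2.
  cudf::size_type const arity = detail::ast_operator_arity(ast_operator::ADD);

  // INT32 + INT32 resolves to INT32 through the same type dispatch that is
  // used when an expression tree is parsed.
  cudf::data_type const out_type = detail::ast_operator_return_type(
    ast_operator::ADD,
    std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::INT32},
                                 cudf::data_type{cudf::type_id::INT32}});
  (void)arity;
  (void)out_type;
}
// ---------------------------------------------------------------------------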
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -795,58 +795,41 @@ void aggregate_result_functor::operator()(aggregation con { if (cache.has_result(values, agg)) { return; } - auto const& udf_ptr = dynamic_cast(agg).udf_ptr; - auto const data_attrs = [&]() -> host_udf_base::data_attribute_set_t { - if (auto tmp = udf_ptr->get_required_data(); !tmp.empty()) { return tmp; } - // Empty attribute set means everything. - return {host_udf_base::groupby_data_attribute::INPUT_VALUES, - host_udf_base::groupby_data_attribute::GROUPED_VALUES, - host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, - host_udf_base::groupby_data_attribute::NUM_GROUPS, - host_udf_base::groupby_data_attribute::GROUP_OFFSETS, - host_udf_base::groupby_data_attribute::GROUP_LABELS}; - }(); + auto const& udf_base_ptr = dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for groupby aggregation."); - // Do not cache udf_input, as the actual input data may change from run to run. - host_udf_base::input_map_t udf_input; - for (auto const& attr : data_attrs) { - CUDF_EXPECTS(std::holds_alternative(attr.value) || - std::holds_alternative>(attr.value), - "Invalid input data attribute for HOST_UDF groupby aggregation."); - if (std::holds_alternative(attr.value)) { - switch (std::get(attr.value)) { - case host_udf_base::groupby_data_attribute::INPUT_VALUES: - udf_input.emplace(attr, values); - break; - case host_udf_base::groupby_data_attribute::GROUPED_VALUES: - udf_input.emplace(attr, get_grouped_values()); - break; - case host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES: - udf_input.emplace(attr, get_sorted_values()); - break; - case host_udf_base::groupby_data_attribute::NUM_GROUPS: - udf_input.emplace(attr, helper.num_groups(stream)); - break; - case host_udf_base::groupby_data_attribute::GROUP_OFFSETS: - udf_input.emplace(attr, helper.group_offsets(stream)); - break; - case host_udf_base::groupby_data_attribute::GROUP_LABELS: - udf_input.emplace(attr, helper.group_labels(stream)); - break; - default: CUDF_UNREACHABLE("Invalid input data attribute for HOST_UDF groupby aggregation."); - } - } else { // data is result from another aggregation - auto other_agg = std::get>(attr.value)->clone(); + if (!udf_ptr->callback_input_values) { + udf_ptr->callback_input_values = [&]() -> column_view { return values; }; + } + if (!udf_ptr->callback_grouped_values) { + udf_ptr->callback_grouped_values = [&]() -> column_view { return get_grouped_values(); }; + } + if (!udf_ptr->callback_sorted_grouped_values) { + udf_ptr->callback_sorted_grouped_values = [&]() -> column_view { return get_sorted_values(); }; + } + if (!udf_ptr->callback_num_groups) { + udf_ptr->callback_num_groups = [&]() -> size_type { return helper.num_groups(stream); }; + } + if (!udf_ptr->callback_group_offsets) { + udf_ptr->callback_group_offsets = [&]() -> device_span { + return helper.group_offsets(stream); + }; + } + if (!udf_ptr->callback_group_labels) { + udf_ptr->callback_group_labels = [&]() -> device_span { + return helper.group_labels(stream); + }; + } + if (!udf_ptr->callback_compute_aggregation) { + udf_ptr->callback_compute_aggregation = + [&](std::unique_ptr other_agg) -> column_view { cudf::detail::aggregation_dispatcher(other_agg->kind, *this, *other_agg); - auto result = cache.get_result(values, *other_agg); - udf_input.emplace(std::move(other_agg), 
std::move(result)); - } + return cache.get_result(values, *other_agg); + }; } - auto output = (*udf_ptr)(udf_input, stream, mr); - CUDF_EXPECTS(std::holds_alternative>(output), - "Invalid output type from HOST_UDF groupby aggregation."); - cache.add_result(values, agg, std::get>(std::move(output))); + cache.add_result(values, agg, (*udf_ptr)(stream, mr)); } } // namespace detail diff --git a/cpp/src/groupby/sort/host_udf_aggregation.cpp b/cpp/src/groupby/sort/host_udf_aggregation.cpp index 0da47e17f48..6f1fe80c4bd 100644 --- a/cpp/src/groupby/sort/host_udf_aggregation.cpp +++ b/cpp/src/groupby/sort/host_udf_aggregation.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,51 +16,9 @@ #include #include -#include namespace cudf { -host_udf_base::data_attribute::data_attribute(data_attribute const& other) - : value{std::visit(cudf::detail::visitor_overload{[](auto const& val) { return value_type{val}; }, - [](std::unique_ptr const& val) { - return value_type{val->clone()}; - }}, - other.value)} -{ -} - -std::size_t host_udf_base::data_attribute::hash::operator()(data_attribute const& attr) const -{ - auto const hash_value = - std::visit(cudf::detail::visitor_overload{ - [](auto const& val) { return std::hash{}(static_cast(val)); }, - [](std::unique_ptr const& val) { return val->do_hash(); }}, - attr.value); - return std::hash{}(attr.value.index()) ^ hash_value; -} - -bool host_udf_base::data_attribute::equal_to::operator()(data_attribute const& lhs, - data_attribute const& rhs) const -{ - auto const& lhs_val = lhs.value; - auto const& rhs_val = rhs.value; - if (lhs_val.index() != rhs_val.index()) { return false; } - return std::visit( - cudf::detail::visitor_overload{ - [](auto const& lhs_val, auto const& rhs_val) { - if constexpr (std::is_same_v) { - return lhs_val == rhs_val; - } else { - return false; - } - }, - [](std::unique_ptr const& lhs_val, std::unique_ptr const& rhs_val) { - return lhs_val->is_equal(*rhs_val); - }}, - lhs_val, - rhs_val); -} - namespace detail { host_udf_aggregation::host_udf_aggregation(std::unique_ptr udf_ptr_) @@ -99,5 +57,9 @@ template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation); template CUDF_EXPORT std::unique_ptr make_host_udf_aggregation(std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); +template CUDF_EXPORT std::unique_ptr + make_host_udf_aggregation(std::unique_ptr); } // namespace cudf diff --git a/cpp/src/io/comp/comp.cpp b/cpp/src/io/comp/comp.cpp index 26535bed43b..3800835eaf1 100644 --- a/cpp/src/io/comp/comp.cpp +++ b/cpp/src/io/comp/comp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,22 +16,45 @@ #include "comp.hpp" +#include "gpuinflate.hpp" +#include "io/utilities/getenv_or.hpp" #include "io/utilities/hostdevice_vector.hpp" #include "nvcomp_adapter.hpp" #include #include +#include #include #include #include #include +#include #include // GZIP compression namespace cudf::io::detail { namespace { +auto& h_comp_pool() +{ + static std::size_t pool_size = + getenv_or("LIBCUDF_HOST_COMPRESSION_NUM_THREADS", std::thread::hardware_concurrency()); + static BS::thread_pool pool(pool_size); + return pool; +} + +std::optional to_nvcomp_compression(compression_type compression) +{ + switch (compression) { + case compression_type::SNAPPY: return nvcomp::compression_type::SNAPPY; + case compression_type::ZSTD: return nvcomp::compression_type::ZSTD; + case compression_type::LZ4: return nvcomp::compression_type::LZ4; + case compression_type::ZLIB: return nvcomp::compression_type::DEFLATE; + default: return std::nullopt; + } +} + /** * @brief GZIP host compressor (includes header) */ @@ -98,8 +121,132 @@ std::vector compress_snappy(host_span src, return cudf::detail::make_std_vector_sync(d_dst, stream); } +void device_compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream) +{ + if (compression == compression_type::NONE) { return; } + + auto const nvcomp_type = to_nvcomp_compression(compression); + auto nvcomp_disabled = nvcomp_type.has_value() ? nvcomp::is_compression_disabled(*nvcomp_type) + : "invalid compression type"; + if (not nvcomp_disabled) { + return nvcomp::batched_compress(*nvcomp_type, inputs, outputs, results, stream); + } + + switch (compression) { + case compression_type::SNAPPY: return gpu_snap(inputs, outputs, results, stream); + default: CUDF_FAIL("Compression error: " + nvcomp_disabled.value()); + } +} + +void host_compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream) +{ + if (compression == compression_type::NONE) { return; } + + auto const num_chunks = inputs.size(); + auto h_results = cudf::detail::make_host_vector(num_chunks, stream); + auto const h_inputs = cudf::detail::make_host_vector_async(inputs, stream); + auto const h_outputs = cudf::detail::make_host_vector_async(outputs, stream); + stream.synchronize(); + + std::vector> tasks; + auto const num_streams = + std::min({num_chunks, + cudf::detail::global_cuda_stream_pool().get_stream_pool_size(), + h_comp_pool().get_thread_count()}); + auto const streams = cudf::detail::fork_streams(stream, num_streams); + for (size_t i = 0; i < num_chunks; ++i) { + auto const cur_stream = streams[i % streams.size()]; + auto task = [d_in = h_inputs[i], d_out = h_outputs[i], cur_stream, compression]() -> size_t { + auto const h_in = cudf::detail::make_host_vector_sync(d_in, cur_stream); + auto const h_out = compress(compression, h_in, cur_stream); + cudf::detail::cuda_memcpy(d_out.subspan(0, h_out.size()), h_out, cur_stream); + return h_out.size(); + }; + tasks.emplace_back(h_comp_pool().submit_task(std::move(task))); + } + + for (auto i = 0ul; i < num_chunks; ++i) { + h_results[i] = {tasks[i].get(), compression_status::SUCCESS}; + } + cudf::detail::cuda_memcpy_async(results, h_results, stream); +} + +[[nodiscard]] bool host_compression_supported(compression_type compression) +{ + switch (compression) { + case compression_type::GZIP: + case compression_type::NONE: return true; + default: return false; + } +} + +[[nodiscard]] 
bool device_compression_supported(compression_type compression) +{ + auto const nvcomp_type = to_nvcomp_compression(compression); + switch (compression) { + case compression_type::LZ4: + case compression_type::ZLIB: + case compression_type::ZSTD: return not nvcomp::is_compression_disabled(nvcomp_type.value()); + case compression_type::SNAPPY: + case compression_type::NONE: return true; + default: return false; + } +} + +[[nodiscard]] bool use_host_compression( + compression_type compression, + [[maybe_unused]] device_span const> inputs, + [[maybe_unused]] device_span const> outputs) +{ + CUDF_EXPECTS( + not host_compression_supported(compression) or device_compression_supported(compression), + "Unsupported compression type"); + if (not host_compression_supported(compression)) { return false; } + if (not device_compression_supported(compression)) { return true; } + // If both host and device compression are supported, use the host if the env var is set + return getenv_or("LIBCUDF_USE_HOST_COMPRESSION", 0); +} + } // namespace +std::optional compress_max_allowed_chunk_size(compression_type compression) +{ + if (auto nvcomp_type = to_nvcomp_compression(compression); + nvcomp_type.has_value() and not nvcomp::is_compression_disabled(*nvcomp_type)) { + return nvcomp::compress_max_allowed_chunk_size(*nvcomp_type); + } + return std::nullopt; +} + +[[nodiscard]] size_t compress_required_chunk_alignment(compression_type compression) +{ + auto nvcomp_type = to_nvcomp_compression(compression); + if (compression == compression_type::NONE or not nvcomp_type.has_value() or + nvcomp::is_compression_disabled(*nvcomp_type)) { + return 1ul; + } + + return nvcomp::required_alignment(*nvcomp_type); +} + +[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size) +{ + if (compression == compression_type::NONE) { return uncompressed_size; } + + if (auto nvcomp_type = to_nvcomp_compression(compression); nvcomp_type.has_value()) { + return nvcomp::compress_max_output_chunk_size(*nvcomp_type, uncompressed_size); + } + CUDF_FAIL("Unsupported compression type"); +} + std::vector compress(compression_type compression, host_span src, rmm::cuda_stream_view stream) @@ -112,4 +259,18 @@ std::vector compress(compression_type compression, } } +void compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + if (use_host_compression(compression, inputs, outputs)) { + return host_compress(compression, inputs, outputs, results, stream); + } else { + return device_compress(compression, inputs, outputs, results, stream); + } +} + } // namespace cudf::io::detail diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/comp.cu similarity index 96% rename from cpp/src/io/comp/statistics.cu rename to cpp/src/io/comp/comp.cu index caee9145d2c..af0f73869a2 100644 --- a/cpp/src/io/comp/statistics.cu +++ b/cpp/src/io/comp/comp.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "gpuinflate.hpp" +#include "comp.hpp" #include diff --git a/cpp/src/io/comp/comp.hpp b/cpp/src/io/comp/comp.hpp index e16f26e1f06..90932a11499 100644 --- a/cpp/src/io/comp/comp.hpp +++ b/cpp/src/io/comp/comp.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,5 +57,57 @@ std::vector compress(compression_type compression, host_span src, rmm::cuda_stream_view stream); +/** + * @brief Maximum size of uncompressed chunks that can be compressed. + * + * @param compression Compression type + * @returns maximum chunk size + */ +[[nodiscard]] std::optional compress_max_allowed_chunk_size(compression_type compression); + +/** + * @brief Gets input and output alignment requirements for the given compression type. + * + * @param compression Compression type + * @returns required alignment + */ +[[nodiscard]] size_t compress_required_chunk_alignment(compression_type compression); + +/** + * @brief Gets the maximum size any chunk could compress to in the batch. + * + * @param compression Compression type + * @param uncompressed_size Size of the largest uncompressed chunk in the batch + */ +[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size); + +/** + * @brief Compresses device memory buffers. + * + * @param compression Type of compression of the input data + * @param inputs Device memory buffers to compress + * @param outputs Device memory buffers to store the compressed output + * @param results Compression results + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void compress(compression_type compression, + device_span const> inputs, + device_span const> outputs, + device_span results, + rmm::cuda_stream_view stream); + +/** + * @brief Aggregate results of compression into a single statistics object. + * + * @param inputs List of uncompressed input buffers + * @param results List of compression results + * @param stream CUDA stream to use + * @return writer_compression_statistics + */ +[[nodiscard]] writer_compression_statistics collect_compression_statistics( + device_span const> inputs, + device_span results, + rmm::cuda_stream_view stream); + } // namespace io::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp index 4b09bd5a84c..0a35b230242 100644 --- a/cpp/src/io/comp/gpuinflate.hpp +++ b/cpp/src/io/comp/gpuinflate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -124,17 +124,4 @@ void gpu_snap(device_span const> inputs, device_span results, rmm::cuda_stream_view stream); -/** - * @brief Aggregate results of compression into a single statistics object. 
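// ---------------------------------------------------------------------------
// [Editorial sketch, not part of the patch] How a format writer might use the
// query helpers declared in comp.hpp above to size and align its compression
// buffers. SNAPPY and the 1 MiB target chunk are illustrative values only;
// comp.hpp is an internal (detail) header, so this is writer-internal code,
// not public API.
#include "io/comp/comp.hpp"

#include <algorithm>
#include <cstddef>
#include <cstdint>

void compression_sizing_example()
{
  using namespace cudf::io;
  auto const compression = compression_type::SNAPPY;

  // Largest uncompressed chunk the selected codec accepts; std::nullopt means
  // no limit is known (e.g. when the type has no nvcomp mapping).
  auto const max_allowed = detail::compress_max_allowed_chunk_size(compression);
  auto const chunk_size =
    std::min<std::size_t>(max_allowed.value_or(1 << 20), 1 << 20);

  // Worst-case compressed size for one chunk and the buffer alignment the
  // codec requires; both feed directly into output-buffer allocation.
  auto const max_out =
    detail::max_compressed_size(compression, static_cast<uint32_t>(chunk_size));
  auto const alignment = detail::compress_required_chunk_alignment(compression);
  (void)max_out;
  (void)alignment;
}
// Note: for the batched device-span overload of compress() added above, when a
// codec is usable on both host and device, setting LIBCUDF_USE_HOST_COMPRESSION=1
// selects the host path, whose thread pool is sized by
// LIBCUDF_HOST_COMPRESSION_NUM_THREADS.
// ---------------------------------------------------------------------------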
- * - * @param inputs List of uncompressed input buffers - * @param results List of compression results - * @param stream CUDA stream to use - * @return writer_compression_statistics - */ -[[nodiscard]] writer_compression_statistics collect_compression_statistics( - device_span const> inputs, - device_span results, - rmm::cuda_stream_view stream); - } // namespace cudf::io::detail diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 88423122e16..d63fa9f5c35 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -766,6 +766,7 @@ void parquet_writer_options_base::set_stats_level(statistics_freq sf) { _stats_l void parquet_writer_options_base::set_compression(compression_type compression) { _compression = compression; + if (compression == compression_type::AUTO) { _compression = compression_type::SNAPPY; } } void parquet_writer_options_base::enable_int96_timestamps(bool req) diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index f4e75f78dec..8b30cee6681 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -407,7 +407,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, std::optional CompressOrcDataStreams( device_span compressed_data, uint32_t num_compressed_blocks, - CompressionKind compression, + compression_type compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, uint32_t comp_block_align, diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 79ecca0ca99..4f296bb5bfc 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,7 +15,6 @@ */ #include "io/comp/gpuinflate.hpp" -#include "io/comp/nvcomp_adapter.hpp" #include "io/utilities/block_utils.cuh" #include "io/utilities/time_utils.cuh" #include "orc_gpu.hpp" @@ -45,8 +44,6 @@ namespace io { namespace orc { namespace gpu { -namespace nvcomp = cudf::io::detail::nvcomp; - using cudf::detail::device_2dspan; using cudf::io::detail::compression_result; using cudf::io::detail::compression_status; @@ -1362,7 +1359,7 @@ void CompactOrcDataStreams(device_2dspan strm_desc, std::optional CompressOrcDataStreams( device_span compressed_data, uint32_t num_compressed_blocks, - CompressionKind compression, + compression_type compression, uint32_t comp_blk_size, uint32_t max_comp_blk_size, uint32_t comp_block_align, @@ -1387,47 +1384,7 @@ std::optional CompressOrcDataStreams( max_comp_blk_size, comp_block_align); - if (compression == SNAPPY) { - try { - if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { - cudf::io::detail::gpu_snap(comp_in, comp_out, comp_res, stream); - } else { - nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } - } catch (...) 
{ - // There was an error in compressing so set an error status for each block - thrust::for_each( - rmm::exec_policy(stream), - comp_res.begin(), - comp_res.end(), - [] __device__(compression_result & stat) { stat.status = compression_status::FAILURE; }); - // Since SNAPPY is the default compression (may not be explicitly requested), fall back to - // writing without compression - CUDF_LOG_WARN("ORC writer: compression failed, writing uncompressed data"); - } - } else if (compression == ZLIB) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress( - nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_res, stream); - } else if (compression == ZSTD) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); - } else if (compression == LZ4) { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::LZ4); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::LZ4, comp_in, comp_out, comp_res, stream); - } else if (compression != NONE) { - CUDF_FAIL("Unsupported compression type"); - } + cudf::io::detail::compress(compression, comp_in, comp_out, comp_res, stream); dim3 dim_block_compact(1024, 1); gpuCompactCompressedBlocks<<>>( diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ce868b83c04..aa0b509981a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,6 @@ * @brief cuDF-IO ORC writer class implementation */ -#include "io/comp/nvcomp_adapter.hpp" #include "io/orc/orc_gpu.hpp" #include "io/statistics/column_statistics.cuh" #include "io/utilities/column_utils.cuh" @@ -71,8 +70,6 @@ namespace cudf::io::orc::detail { -namespace nvcomp = cudf::io::detail::nvcomp; - template [[nodiscard]] constexpr int varint_size(T val) { @@ -92,21 +89,8 @@ struct row_group_index_info { }; namespace { - /** - * @brief Translates ORC compression to nvCOMP compression - */ -auto to_nvcomp_compression_type(CompressionKind compression_kind) -{ - if (compression_kind == SNAPPY) return nvcomp::compression_type::SNAPPY; - if (compression_kind == ZLIB) return nvcomp::compression_type::DEFLATE; - if (compression_kind == ZSTD) return nvcomp::compression_type::ZSTD; - if (compression_kind == LZ4) return nvcomp::compression_type::LZ4; - CUDF_FAIL("Unsupported compression type"); -} - -/** - * @brief Translates cuDF compression to ORC compression + * @brief Translates cuDF compression to ORC compression. */ orc::CompressionKind to_orc_compression(compression_type compression) { @@ -122,19 +106,14 @@ orc::CompressionKind to_orc_compression(compression_type compression) } /** - * @brief Returns the block size for a given compression kind. + * @brief Returns the block size for a given compression format. 
*/ -constexpr size_t compression_block_size(orc::CompressionKind compression) +size_t compression_block_size(compression_type compression) { - if (compression == orc::CompressionKind::NONE) { return 0; } - - auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) - ? std::nullopt - : nvcomp::compress_max_allowed_chunk_size(ncomp_type); + auto const comp_limit = compress_max_allowed_chunk_size(compression); constexpr size_t max_block_size = 256 * 1024; - return std::min(nvcomp_limit.value_or(max_block_size), max_block_size); + return std::min(comp_limit.value_or(max_block_size), max_block_size); } /** @@ -534,26 +513,6 @@ size_t RLE_stream_size(TypeKind kind, size_t count) } } -auto uncomp_block_alignment(CompressionKind compression_kind) -{ - if (compression_kind == NONE or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { - return 1ul; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind)); -} - -auto comp_block_alignment(CompressionKind compression_kind) -{ - if (compression_kind == NONE or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { - return 1ul; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind)); -} - /** * @brief Builds up per-column streams. * @@ -566,7 +525,7 @@ orc_streams create_streams(host_span columns, file_segmentation const& segmentation, std::map const& decimal_column_sizes, bool enable_dictionary, - CompressionKind compression_kind, + compression_type compression, single_write_mode write_mode) { // 'column 0' row index stream @@ -610,7 +569,7 @@ orc_streams create_streams(host_span columns, auto add_stream = [&](gpu::StreamIndexType index_type, StreamKind kind, TypeKind type_kind, size_t size) { - auto const max_alignment_padding = uncomp_block_alignment(compression_kind) - 1; + auto const max_alignment_padding = compress_required_chunk_alignment(compression) - 1; const auto base = column.index() * gpu::CI_NUM_STREAMS; ids[base + index_type] = streams.size(); streams.push_back(orc::Stream{ @@ -1473,7 +1432,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, * @param[in] rg_stats row group level statistics * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams - * @param[in] compression_kind The compression kind + * @param[in] compression The compression format * @param[in] compression_blocksize The block size used for compression * @param[in] out_sink Sink for writing data */ @@ -1487,7 +1446,7 @@ void write_index_stream(int32_t stripe_id, host_span rg_stats, StripeInformation* stripe, orc_streams* streams, - CompressionKind compression_kind, + compression_type compression, size_t compression_blocksize, std::unique_ptr const& out_sink) { @@ -1501,7 +1460,7 @@ void write_index_stream(int32_t stripe_id, row_group_index_info record; if (stream.ids[type] > 0) { record.pos = 0; - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { auto const& ss = strm_desc[stripe_id][stream.ids[type] - (columns.size() + 1)]; record.blk_pos = ss.first_block; record.comp_pos = 0; @@ -1541,7 +1500,7 @@ void write_index_stream(int32_t stripe_id, } } - ProtobufWriter pbw((compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((compression != compression_type::NONE) ? 
3 : 0); // Add row index entries auto const& rowgroups_range = segmentation.stripes[stripe_id]; @@ -1566,7 +1525,7 @@ void write_index_stream(int32_t stripe_id, }); (*streams)[stream_id].length = pbw.size(); - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { uint32_t uncomp_ix_len = (uint32_t)((*streams)[stream_id].length - 3) * 2 + 1; pbw.buffer()[0] = static_cast(uncomp_ix_len >> 0); pbw.buffer()[1] = static_cast(uncomp_ix_len >> 8); @@ -1585,7 +1544,7 @@ void write_index_stream(int32_t stripe_id, * @param[in,out] bounce_buffer Pinned memory bounce buffer for D2H data transfer * @param[in,out] stripe Stream's parent stripe * @param[in,out] streams List of all streams - * @param[in] compression_kind The compression kind + * @param[in] compression The compression format * @param[in] out_sink Sink for writing data * @param[in] stream CUDA stream used for device memory operations and kernel launches * @return An std::future that should be synchronized to ensure the writing is complete @@ -1596,7 +1555,7 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, host_span bounce_buffer, StripeInformation* stripe, orc_streams* streams, - CompressionKind compression_kind, + compression_type compression, std::unique_ptr const& out_sink, rmm::cuda_stream_view stream) { @@ -1606,8 +1565,9 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, return std::async(std::launch::deferred, [] {}); } - auto const* stream_in = (compression_kind == NONE) ? enc_stream.data_ptrs[strm_desc.stream_type] - : (compressed_data + strm_desc.bfr_offset); + auto const* stream_in = (compression == compression_type::NONE) + ? enc_stream.data_ptrs[strm_desc.stream_type] + : (compressed_data + strm_desc.bfr_offset); auto write_task = [&]() { if (out_sink->is_device_write_preferred(length)) { @@ -1627,15 +1587,15 @@ std::future write_data_stream(gpu::StripeStream const& strm_desc, /** * @brief Insert 3-byte uncompressed block headers in a byte vector * - * @param compression_kind The compression kind + * @param compression The compression kind * @param compression_blocksize The block size used for compression * @param v The destitation byte vector to write, which must include initial 3-byte header */ -void add_uncompressed_block_headers(CompressionKind compression_kind, +void add_uncompressed_block_headers(compression_type compression, size_t compression_blocksize, std::vector& v) { - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { size_t uncomp_len = v.size() - 3, pos = 0, block_len; while (uncomp_len > compression_blocksize) { block_len = compression_blocksize * 2 + 1; @@ -2021,14 +1981,6 @@ std::map decimal_column_sizes( return column_sizes; } -size_t max_compression_output_size(CompressionKind compression_kind, uint32_t compression_blocksize) -{ - if (compression_kind == NONE) return 0; - - return nvcomp::compress_max_output_chunk_size(to_nvcomp_compression_type(compression_kind), - compression_blocksize); -} - std::unique_ptr make_table_meta(table_view const& input) { auto table_meta = std::make_unique(input); @@ -2287,7 +2239,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, * @param row_index_stride The row index stride * @param enable_dictionary Whether dictionary is enabled * @param sort_dictionaries Whether to sort the dictionaries - * @param compression_kind The compression kind + * @param compression The compression format * @param compression_blocksize The block size used for compression * @param 
stats_freq Column statistics granularity type for parquet/orc writers * @param collect_compression_stats Flag to indicate if compression statistics should be collected @@ -2302,7 +2254,7 @@ auto convert_table_to_orc_data(table_view const& input, size_type row_index_stride, bool enable_dictionary, bool sort_dictionaries, - CompressionKind compression_kind, + compression_type compression, size_t compression_blocksize, statistics_freq stats_freq, bool collect_compression_stats, @@ -2329,17 +2281,16 @@ auto convert_table_to_orc_data(table_view const& input, auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); - auto const uncompressed_block_align = uncomp_block_alignment(compression_kind); - auto const compressed_block_align = comp_block_alignment(compression_kind); + auto const block_align = compress_required_chunk_alignment(compression); auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes), enable_dictionary, - compression_kind, + compression, write_mode); auto enc_data = encode_columns( - orc_table, std::move(dec_chunk_sizes), segmentation, streams, uncompressed_block_align, stream); + orc_table, std::move(dec_chunk_sizes), segmentation, streams, block_align, stream); stripe_dicts.on_encode_complete(stream); @@ -2371,16 +2322,15 @@ auto convert_table_to_orc_data(table_view const& input, size_t compressed_bfr_size = 0; size_t num_compressed_blocks = 0; - auto const max_compressed_block_size = - max_compression_output_size(compression_kind, compression_blocksize); + auto const max_compressed_block_size = max_compressed_size(compression, compression_blocksize); auto const padded_max_compressed_block_size = - util::round_up_unsafe(max_compressed_block_size, compressed_block_align); + util::round_up_unsafe(max_compressed_block_size, block_align); auto const padded_block_header_size = - util::round_up_unsafe(block_header_size, compressed_block_align); + util::round_up_unsafe(block_header_size, block_align); for (auto& ss : strm_descs.host_view().flat_view()) { size_t stream_size = ss.stream_size; - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { ss.first_block = num_compressed_blocks; ss.bfr_offset = compressed_bfr_size; @@ -2401,14 +2351,14 @@ auto convert_table_to_orc_data(table_view const& input, comp_results.d_begin(), comp_results.d_end(), compression_result{0, compression_status::FAILURE}); - if (compression_kind != NONE) { + if (compression != compression_type::NONE) { strm_descs.host_to_device_async(stream); compression_stats = gpu::CompressOrcDataStreams(compressed_data, num_compressed_blocks, - compression_kind, + compression, compression_blocksize, max_compressed_block_size, - compressed_block_align, + block_align, collect_compression_stats, strm_descs, enc_data.streams, @@ -2459,8 +2409,8 @@ writer::impl::impl(std::unique_ptr sink, : _stream(stream), _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, _row_index_stride{options.get_row_index_stride()}, - _compression_kind(to_orc_compression(options.get_compression())), - _compression_blocksize(compression_block_size(_compression_kind)), + _compression{options.get_compression()}, + _compression_blocksize(compression_block_size(_compression)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), _sort_dictionaries{options.get_enable_dictionary_sort()}, 
@@ -2480,8 +2430,8 @@ writer::impl::impl(std::unique_ptr sink, : _stream(stream), _max_stripe_size{options.get_stripe_size_bytes(), options.get_stripe_size_rows()}, _row_index_stride{options.get_row_index_stride()}, - _compression_kind(to_orc_compression(options.get_compression())), - _compression_blocksize(compression_block_size(_compression_kind)), + _compression{options.get_compression()}, + _compression_blocksize(compression_block_size(_compression)), _compression_statistics(options.get_compression_statistics()), _stats_freq(options.get_statistics_freq()), _sort_dictionaries{options.get_enable_dictionary_sort()}, @@ -2526,7 +2476,7 @@ void writer::impl::write(table_view const& input) _row_index_stride, _enable_dictionary, _sort_dictionaries, - _compression_kind, + _compression, _compression_blocksize, _stats_freq, _compression_statistics != nullptr, @@ -2613,7 +2563,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, rg_stats, &stripe, &streams, - _compression_kind, + _compression, _compression_blocksize, _out_sink); } @@ -2627,7 +2577,7 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, bounce_buffer, &stripe, &streams, - _compression_kind, + _compression, _out_sink, _stream)); } @@ -2645,10 +2595,10 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data, : 0; if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; } } - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(sf); stripe.footerLength = pbw.size(); - if (_compression_kind != NONE) { + if (_compression != compression_type::NONE) { uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1; pbw.buffer()[0] = static_cast(uncomp_sf_len >> 0); pbw.buffer()[1] = static_cast(uncomp_sf_len >> 8); @@ -2780,21 +2730,21 @@ void writer::impl::close() // Write statistics metadata if (not _orc_meta.stripeStats.empty()) { - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(_orc_meta); - add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer()); + add_uncompressed_block_headers(_compression, _compression_blocksize, pbw.buffer()); ps.metadataLength = pbw.size(); _out_sink->host_write(pbw.data(), pbw.size()); } else { ps.metadataLength = 0; } - ProtobufWriter pbw((_compression_kind != NONE) ? 3 : 0); + ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0); pbw.write(_footer); - add_uncompressed_block_headers(_compression_kind, _compression_blocksize, pbw.buffer()); + add_uncompressed_block_headers(_compression, _compression_blocksize, pbw.buffer()); // Write postscript metadata ps.footerLength = pbw.size(); - ps.compression = _compression_kind; + ps.compression = to_orc_compression(_compression); ps.compressionBlockSize = _compression_blocksize; ps.version = {0, 12}; // Hive 0.12 ps.writerVersion = cudf_writer_version; diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index cae849ee315..7d23482cb17 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -342,7 +342,7 @@ class writer::impl { // Writer options. 
stripe_size_limits const _max_stripe_size; size_type const _row_index_stride; - CompressionKind const _compression_kind; + compression_type const _compression; size_t const _compression_blocksize; std::shared_ptr _compression_statistics; // Optional output statistics_freq const _stats_freq; diff --git a/cpp/src/io/parquet/bloom_filter_reader.cu b/cpp/src/io/parquet/bloom_filter_reader.cu new file mode 100644 index 00000000000..8c404950efa --- /dev/null +++ b/cpp/src/io/parquet/bloom_filter_reader.cu @@ -0,0 +1,683 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compact_protocol_reader.hpp" +#include "io/parquet/parquet.hpp" +#include "reader_impl_helpers.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::io::parquet::detail { +namespace { + +/** + * @brief Converts bloom filter membership results (for each column chunk) to a device column. + * + */ +struct bloom_filter_caster { + cudf::device_span const> bloom_filter_spans; + host_span parquet_types; + size_t total_row_groups; + size_t num_equality_columns; + + enum class is_int96_timestamp : bool { YES, NO }; + + template + std::unique_ptr query_bloom_filter(cudf::size_type equality_col_idx, + cudf::data_type dtype, + ast::literal const* const literal, + rmm::cuda_stream_view stream) const + { + using key_type = T; + using policy_type = cuco::arrow_filter_policy; + using word_type = typename policy_type::word_type; + + // Boolean, List, Struct, Dictionary types are not supported + if constexpr (std::is_same_v or + (cudf::is_compound() and not std::is_same_v)) { + CUDF_FAIL("Bloom filters do not support boolean or compound types"); + } else { + // Check if the literal has the same type as the predicate column + CUDF_EXPECTS( + dtype == literal->get_data_type() and + cudf::have_same_types( + cudf::column_view{dtype, 0, {}, {}, 0, 0, {}}, + cudf::scalar_type_t(T{}, false, stream, cudf::get_current_device_resource_ref())), + "Mismatched predicate column and literal types"); + } + + // Filter properties + auto constexpr bytes_per_block = sizeof(word_type) * policy_type::words_per_block; + + rmm::device_buffer results{total_row_groups, stream, cudf::get_current_device_resource_ref()}; + cudf::device_span results_span{static_cast(results.data()), total_row_groups}; + + // Query literal in bloom filters from each column chunk (row group). 
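    // [Editorial note, not part of the patch] Each entry produced by the
    // tabulate below corresponds to one row group of this equality column:
    // `true` means the literal *may* be present (or that the chunk carries no
    // bloom filter), while `false` means it is definitely absent. Bloom
    // filters have no false negatives, so only `false` entries can be used to
    // prune row groups later on.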
+ thrust::tabulate( + rmm::exec_policy_nosync(stream), + results_span.begin(), + results_span.end(), + [filter_span = bloom_filter_spans.data(), + d_scalar = literal->get_value(), + col_idx = equality_col_idx, + num_equality_columns = num_equality_columns] __device__(auto row_group_idx) { + // Filter bitset buffer index + auto const filter_idx = col_idx + (num_equality_columns * row_group_idx); + auto const filter_size = filter_span[filter_idx].size(); + + // If no bloom filter, then fill in `true` as membership cannot be determined + if (filter_size == 0) { return true; } + + // Number of filter blocks + auto const num_filter_blocks = filter_size / bytes_per_block; + + // Create a bloom filter view. + cuco::bloom_filter_ref, + cuco::thread_scope_thread, + policy_type> + filter{reinterpret_cast(filter_span[filter_idx].data()), + num_filter_blocks, + {}, // Thread scope as the same literal is being searched across different bitsets + // per thread + {}}; // Arrow policy with cudf::hashing::detail::XXHash_64 seeded with 0 for Arrow + // compatibility + + // If int96_timestamp type, convert literal to string_view and query bloom + // filter + if constexpr (cuda::std::is_same_v and + IS_INT96_TIMESTAMP == is_int96_timestamp::YES) { + auto const int128_key = static_cast<__int128_t>(d_scalar.value()); + cudf::string_view probe_key{reinterpret_cast(&int128_key), 12}; + return filter.contains(probe_key); + } else { + // Query the bloom filter and store results + return filter.contains(d_scalar.value()); + } + }); + + return std::make_unique(cudf::data_type{cudf::type_id::BOOL8}, + static_cast(total_row_groups), + std::move(results), + rmm::device_buffer{}, + 0); + } + + // Creates device columns from bloom filter membership + template + std::unique_ptr operator()(cudf::size_type equality_col_idx, + cudf::data_type dtype, + ast::literal* const literal, + rmm::cuda_stream_view stream) const + { + // For INT96 timestamps, use cudf::string_view type and set is_int96_timestamp to YES + if constexpr (cudf::is_timestamp()) { + if (parquet_types[equality_col_idx] == Type::INT96) { + // For INT96 timestamps, use cudf::string_view type and set is_int96_timestamp to YES + return query_bloom_filter( + equality_col_idx, dtype, literal, stream); + } + } + + // For all other cases + return query_bloom_filter(equality_col_idx, dtype, literal, stream); + } +}; + +/** + * @brief Collects lists of equality predicate literals in the AST expression, one list per input + * table column. This is used in row group filtering based on bloom filters. 
+ */ +class equality_literals_collector : public ast::detail::expression_transformer { + public: + equality_literals_collector() = default; + + equality_literals_collector(ast::expression const& expr, cudf::size_type num_input_columns) + : _num_input_columns{num_input_columns} + { + _equality_literals.resize(_num_input_columns); + expr.accept(*this); + } + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::literal const& ) + */ + std::reference_wrapper visit(ast::literal const& expr) override + { + return expr; + } + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& ) + */ + std::reference_wrapper visit(ast::column_reference const& expr) override + { + CUDF_EXPECTS(expr.get_table_source() == ast::table_reference::LEFT, + "BloomfilterAST supports only left table"); + CUDF_EXPECTS(expr.get_column_index() < _num_input_columns, + "Column index cannot be more than number of columns in the table"); + return expr; + } + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& ) + */ + std::reference_wrapper visit( + ast::column_name_reference const& expr) override + { + CUDF_FAIL("Column name reference is not supported in BloomfilterAST"); + } + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::operation const& ) + */ + std::reference_wrapper visit(ast::operation const& expr) override + { + using cudf::ast::ast_operator; + auto const operands = expr.get_operands(); + auto const op = expr.get_operator(); + + if (auto* v = dynamic_cast(&operands[0].get())) { + // First operand should be column reference, second should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, + "Only binary operations are supported on column reference"); + auto const literal_ptr = dynamic_cast(&operands[1].get()); + CUDF_EXPECTS(literal_ptr != nullptr, + "Second operand of binary operation with column reference must be a literal"); + v->accept(*this); + + // Push to the corresponding column's literals list iff equality predicate is seen + if (op == ast_operator::EQUAL) { + auto const col_idx = v->get_column_index(); + _equality_literals[col_idx].emplace_back(const_cast(literal_ptr)); + } + } else { + // Just visit the operands and ignore any output + std::ignore = visit_operands(operands); + } + + return expr; + } + + /** + * @brief Vectors of equality literals in the AST expression, one per input table column + * + * @return Vectors of equality literals, one per input table column + */ + [[nodiscard]] std::vector> get_equality_literals() && + { + return std::move(_equality_literals); + } + + private: + std::vector> _equality_literals; + + protected: + std::vector> visit_operands( + cudf::host_span const> operands) + { + std::vector> transformed_operands; + for (auto const& operand : operands) { + auto const new_operand = operand.get().accept(*this); + transformed_operands.push_back(new_operand); + } + return transformed_operands; + } + size_type _num_input_columns; +}; + +/** + * @brief Converts AST expression to bloom filter membership (BloomfilterAST) expression. + * This is used in row group filtering based on equality predicate. 
+ */ +class bloom_filter_expression_converter : public equality_literals_collector { + public: + bloom_filter_expression_converter( + ast::expression const& expr, + size_type num_input_columns, + cudf::host_span const> equality_literals) + : _equality_literals{equality_literals} + { + // Set the num columns + _num_input_columns = num_input_columns; + + // Compute and store columns literals offsets + _col_literals_offsets.reserve(_num_input_columns + 1); + _col_literals_offsets.emplace_back(0); + + std::transform(equality_literals.begin(), + equality_literals.end(), + std::back_inserter(_col_literals_offsets), + [&](auto const& col_literal_map) { + return _col_literals_offsets.back() + + static_cast(col_literal_map.size()); + }); + + // Add this visitor + expr.accept(*this); + } + + /** + * @brief Delete equality literals getter as it's not needed in the derived class + */ + [[nodiscard]] std::vector> get_equality_literals() && = delete; + + // Bring all overloads of `visit` from equality_predicate_collector into scope + using equality_literals_collector::visit; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::operation const& ) + */ + std::reference_wrapper visit(ast::operation const& expr) override + { + using cudf::ast::ast_operator; + auto const operands = expr.get_operands(); + auto const op = expr.get_operator(); + + if (auto* v = dynamic_cast(&operands[0].get())) { + // First operand should be column reference, second should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, + "Only binary operations are supported on column reference"); + CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, + "Second operand of binary operation with column reference must be a literal"); + v->accept(*this); + + if (op == ast_operator::EQUAL) { + // Search the literal in this input column's equality literals list and add to the offset. + auto const col_idx = v->get_column_index(); + auto const& equality_literals = _equality_literals[col_idx]; + auto col_literal_offset = _col_literals_offsets[col_idx]; + auto const literal_iter = std::find(equality_literals.cbegin(), + equality_literals.cend(), + dynamic_cast(&operands[1].get())); + CUDF_EXPECTS(literal_iter != equality_literals.end(), "Could not find the literal ptr"); + col_literal_offset += std::distance(equality_literals.cbegin(), literal_iter); + + // Evaluate boolean is_true(value) expression as NOT(NOT(value)) + auto const& value = _bloom_filter_expr.push(ast::column_reference{col_literal_offset}); + _bloom_filter_expr.push(ast::operation{ + ast_operator::NOT, _bloom_filter_expr.push(ast::operation{ast_operator::NOT, value})}); + } + // For all other expressions, push an always true expression + else { + _bloom_filter_expr.push( + ast::operation{ast_operator::NOT, + _bloom_filter_expr.push(ast::operation{ast_operator::NOT, _always_true})}); + } + } else { + auto new_operands = visit_operands(operands); + if (cudf::ast::detail::ast_operator_arity(op) == 2) { + _bloom_filter_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); + } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { + _bloom_filter_expr.push(ast::operation{op, new_operands.front()}); + } + } + return _bloom_filter_expr.back(); + } + + /** + * @brief Returns the AST to apply on bloom filter membership. 
+ * + * @return AST operation expression + */ + [[nodiscard]] std::reference_wrapper get_bloom_filter_expr() const + { + return _bloom_filter_expr.back(); + } + + private: + std::vector _col_literals_offsets; + cudf::host_span const> _equality_literals; + ast::tree _bloom_filter_expr; + cudf::numeric_scalar _always_true_scalar{true}; + ast::literal const _always_true{_always_true_scalar}; +}; + +/** + * @brief Reads bloom filter data to device. + * + * @param sources Dataset sources + * @param num_chunks Number of total column chunks to read + * @param bloom_filter_data Device buffers to hold bloom filter bitsets for each chunk + * @param bloom_filter_offsets Bloom filter offsets for all chunks + * @param bloom_filter_sizes Bloom filter sizes for all chunks + * @param chunk_source_map Association between each column chunk and its source + * @param stream CUDA stream used for device memory operations and kernel launches + */ +void read_bloom_filter_data(host_span const> sources, + size_t num_chunks, + cudf::host_span bloom_filter_data, + cudf::host_span> bloom_filter_offsets, + cudf::host_span> bloom_filter_sizes, + std::vector const& chunk_source_map, + rmm::cuda_stream_view stream) +{ + // Read tasks for bloom filter data + std::vector> read_tasks; + + // Read bloom filters for all column chunks + std::for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(num_chunks), + [&](auto const chunk) { + // If bloom filter offset absent, fill in an empty buffer and skip ahead + if (not bloom_filter_offsets[chunk].has_value()) { + bloom_filter_data[chunk] = {}; + return; + } + // Read bloom filter iff present + auto const bloom_filter_offset = bloom_filter_offsets[chunk].value(); + + // If Bloom filter size (header + bitset) is available, just read the entire thing. + // Else just read 256 bytes which will contain the entire header and may contain the + // entire bitset as well. + auto constexpr bloom_filter_size_guess = 256; + auto const initial_read_size = + static_cast(bloom_filter_sizes[chunk].value_or(bloom_filter_size_guess)); + + // Read an initial buffer from source + auto& source = sources[chunk_source_map[chunk]]; + auto buffer = source->host_read(bloom_filter_offset, initial_read_size); + + // Deserialize the Bloom filter header from the buffer. + BloomFilterHeader header; + CompactProtocolReader cp{buffer->data(), buffer->size()}; + cp.read(&header); + + // Get the hardcoded words_per_block value from `cuco::arrow_filter_policy` using a temporary + // `std::byte` key type. + auto constexpr words_per_block = + cuco::arrow_filter_policy::words_per_block; + + // Check if the bloom filter header is valid. + auto const is_header_valid = + (header.num_bytes % words_per_block) == 0 and + header.compression.compression == BloomFilterCompression::Compression::UNCOMPRESSED and + header.algorithm.algorithm == BloomFilterAlgorithm::Algorithm::SPLIT_BLOCK and + header.hash.hash == BloomFilterHash::Hash::XXHASH; + + // Do not read if the bloom filter is invalid + if (not is_header_valid) { + bloom_filter_data[chunk] = {}; + CUDF_LOG_WARN("Encountered an invalid bloom filter header. Skipping"); + return; + } + + // Bloom filter header size + auto const bloom_filter_header_size = static_cast(cp.bytecount()); + auto const bitset_size = static_cast(header.num_bytes); + + // Check if we already read in the filter bitset in the initial read. 
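      // [Editorial note, not part of the patch] Worked example: when the column
      // metadata lacks bloom_filter_length, 256 bytes are read speculatively.
      // If the decoded header turns out to occupy, say, 32 bytes and
      // header.num_bytes is 2048, the bitset does not fit in that first read,
      // so a second read of 2048 bytes at bloom_filter_offset + 32 is issued
      // below (device-direct when the datasource prefers it, otherwise through
      // a host buffer).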
+ if (initial_read_size >= bloom_filter_header_size + bitset_size) { + bloom_filter_data[chunk] = + rmm::device_buffer{buffer->data() + bloom_filter_header_size, bitset_size, stream}; + } + // Read the bitset from datasource. + else { + auto const bitset_offset = bloom_filter_offset + bloom_filter_header_size; + // Directly read to device if preferred + if (source->is_device_read_preferred(bitset_size)) { + bloom_filter_data[chunk] = rmm::device_buffer{bitset_size, stream}; + auto future_read_size = + source->device_read_async(bitset_offset, + bitset_size, + static_cast(bloom_filter_data[chunk].data()), + stream); + + read_tasks.emplace_back(std::move(future_read_size)); + } else { + buffer = source->host_read(bitset_offset, bitset_size); + bloom_filter_data[chunk] = rmm::device_buffer{buffer->data(), buffer->size(), stream}; + } + } + }); + + // Read task sync function + for (auto& task : read_tasks) { + task.wait(); + } +} + +} // namespace + +std::vector aggregate_reader_metadata::read_bloom_filters( + host_span const> sources, + host_span const> row_group_indices, + host_span column_schemas, + size_type total_row_groups, + rmm::cuda_stream_view stream) const +{ + // Descriptors for all the chunks that make up the selected columns + auto const num_input_columns = column_schemas.size(); + auto const num_chunks = total_row_groups * num_input_columns; + + // Association between each column chunk and its source + std::vector chunk_source_map(num_chunks); + + // Keep track of column chunk file offsets + std::vector> bloom_filter_offsets(num_chunks); + std::vector> bloom_filter_sizes(num_chunks); + + // Gather all bloom filter offsets and sizes. + size_type chunk_count = 0; + + // Flag to check if we have at least one valid bloom filter offset + auto have_bloom_filters = false; + + // For all data sources + std::for_each(thrust::counting_iterator(0), + thrust::counting_iterator(row_group_indices.size()), + [&](auto const src_index) { + // Get all row group indices in the data source + auto const& rg_indices = row_group_indices[src_index]; + // For all row groups + std::for_each(rg_indices.cbegin(), rg_indices.cend(), [&](auto const rg_index) { + // For all column chunks + std::for_each( + column_schemas.begin(), column_schemas.end(), [&](auto const schema_idx) { + auto& col_meta = get_column_metadata(rg_index, src_index, schema_idx); + + // Get bloom filter offsets and sizes + bloom_filter_offsets[chunk_count] = col_meta.bloom_filter_offset; + bloom_filter_sizes[chunk_count] = col_meta.bloom_filter_length; + + // Set `have_bloom_filters` if `bloom_filter_offset` is valid + if (col_meta.bloom_filter_offset.has_value()) { have_bloom_filters = true; } + + // Map each column chunk to its source index + chunk_source_map[chunk_count] = src_index; + chunk_count++; + }); + }); + }); + + // Exit early if we don't have any bloom filters + if (not have_bloom_filters) { return {}; } + + // Vector to hold bloom filter data + std::vector bloom_filter_data(num_chunks); + + // Read bloom filter data + read_bloom_filter_data(sources, + num_chunks, + bloom_filter_data, + bloom_filter_offsets, + bloom_filter_sizes, + chunk_source_map, + stream); + + // Return bloom filter data + return bloom_filter_data; +} + +std::vector aggregate_reader_metadata::get_parquet_types( + host_span const> row_group_indices, + host_span column_schemas) const +{ + std::vector parquet_types(column_schemas.size()); + // Find a source with at least one row group + auto const src_iter = std::find_if(row_group_indices.begin(), + 
row_group_indices.end(), + [](auto const& rg) { return rg.size() > 0; }); + CUDF_EXPECTS(src_iter != row_group_indices.end(), ""); + + // Source index + auto const src_index = std::distance(row_group_indices.begin(), src_iter); + std::transform(column_schemas.begin(), + column_schemas.end(), + parquet_types.begin(), + [&](auto const schema_idx) { + // Use the first row group in this source + auto constexpr row_group_index = 0; + return get_column_metadata(row_group_index, src_index, schema_idx).type; + }); + + return parquet_types; +} + +std::optional>> aggregate_reader_metadata::apply_bloom_filters( + host_span const> sources, + host_span const> input_row_group_indices, + host_span output_dtypes, + host_span output_column_schemas, + std::reference_wrapper filter, + rmm::cuda_stream_view stream) const +{ + // Number of input table columns + auto const num_input_columns = static_cast(output_dtypes.size()); + + // Total number of row groups after StatsAST filtration + auto const total_row_groups = std::accumulate( + input_row_group_indices.begin(), + input_row_group_indices.end(), + size_t{0}, + [](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); }); + + // Check if we have less than 2B total row groups. + CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), + "Total number of row groups exceed the size_type's limit"); + + // Collect equality literals for each input table column + auto const equality_literals = + equality_literals_collector{filter.get(), num_input_columns}.get_equality_literals(); + + // Collect schema indices of columns with equality predicate(s) + std::vector equality_col_schemas; + thrust::copy_if(thrust::host, + output_column_schemas.begin(), + output_column_schemas.end(), + equality_literals.begin(), + std::back_inserter(equality_col_schemas), + [](auto& eq_literals) { return not eq_literals.empty(); }); + + // Return early if no column with equality predicate(s) + if (equality_col_schemas.empty()) { return std::nullopt; } + + // Read a vector of bloom filter bitset device buffers for all columns with equality + // predicate(s) across all row groups + auto bloom_filter_data = read_bloom_filters( + sources, input_row_group_indices, equality_col_schemas, total_row_groups, stream); + + // No bloom filter buffers, return the original row group indices + if (bloom_filter_data.empty()) { return std::nullopt; } + + // Get parquet types for the predicate columns + auto const parquet_types = get_parquet_types(input_row_group_indices, equality_col_schemas); + + // Create spans from bloom filter bitset buffers to use in cuco::bloom_filter_ref. + std::vector> h_bloom_filter_spans; + h_bloom_filter_spans.reserve(bloom_filter_data.size()); + std::transform(bloom_filter_data.begin(), + bloom_filter_data.end(), + std::back_inserter(h_bloom_filter_spans), + [&](auto& buffer) { + return cudf::device_span{ + static_cast(buffer.data()), buffer.size()}; + }); + + // Copy bloom filter bitset spans to device + auto const bloom_filter_spans = cudf::detail::make_device_uvector_async( + h_bloom_filter_spans, stream, cudf::get_current_device_resource_ref()); + + // Create a bloom filter query table caster + bloom_filter_caster const bloom_filter_col{ + bloom_filter_spans, parquet_types, total_row_groups, equality_col_schemas.size()}; + + // Converts bloom filter membership for equality predicate columns to a table + // containing a column for each `col[i] == literal` predicate to be evaluated. 
+ // The table contains #sources * #column_chunks_per_src rows. + std::vector> bloom_filter_membership_columns; + size_t equality_col_idx = 0; + std::for_each( + thrust::counting_iterator(0), + thrust::counting_iterator(output_dtypes.size()), + [&](auto input_col_idx) { + auto const& dtype = output_dtypes[input_col_idx]; + + // Skip if no equality literals for this column + if (equality_literals[input_col_idx].empty()) { return; } + + // Skip if non-comparable (compound) type except string + if (cudf::is_compound(dtype) and dtype.id() != cudf::type_id::STRING) { return; } + + // Add a column for all literals associated with an equality column + for (auto const& literal : equality_literals[input_col_idx]) { + bloom_filter_membership_columns.emplace_back(cudf::type_dispatcher( + dtype, bloom_filter_col, equality_col_idx, dtype, literal, stream)); + } + equality_col_idx++; + }); + + // Create a table from columns + auto bloom_filter_membership_table = cudf::table(std::move(bloom_filter_membership_columns)); + + // Convert AST to BloomfilterAST expression with reference to bloom filter membership + // in above `bloom_filter_membership_table` + bloom_filter_expression_converter bloom_filter_expr{ + filter.get(), num_input_columns, {equality_literals}}; + + // Filter bloom filter membership table with the BloomfilterAST expression and collect + // filtered row group indices + return collect_filtered_row_group_indices(bloom_filter_membership_table, + bloom_filter_expr.get_bloom_filter_expr(), + input_row_group_indices, + stream); +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index f1ecf66c29f..b8e72aaac88 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
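For context, the bloom filter pruning wired up above is driven entirely by the reader's existing filter expression; only equality predicates can be answered by a bloom filter. A minimal illustrative caller follows (the file name and column name are placeholders, not part of this patch):

@code{.cpp}
#include <cudf/ast/expressions.hpp>
#include <cudf/io/parquet.hpp>
#include <cudf/scalar/scalar.hpp>

cudf::io::table_with_metadata read_with_equality_filter()
{
  // Equality predicate `id == 42`; other operators still rely on column chunk statistics only.
  auto col_ref = cudf::ast::column_name_reference{"id"};
  auto value   = cudf::numeric_scalar<int32_t>{42};
  auto literal = cudf::ast::literal{value};
  auto filter  = cudf::ast::operation{cudf::ast::ast_operator::EQUAL, col_ref, literal};

  auto options =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{"data.parquet"})
      .filter(filter)
      .build();

  // Row groups whose bloom filter proves that `id == 42` cannot match are skipped.
  return cudf::io::read_parquet(options);
}
@endcode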
@@ -658,6 +658,33 @@ void CompactProtocolReader::read(ColumnChunk* c) function_builder(this, op); } +void CompactProtocolReader::read(BloomFilterAlgorithm* alg) +{ + auto op = std::make_tuple(parquet_field_union_enumerator(1, alg->algorithm)); + function_builder(this, op); +} + +void CompactProtocolReader::read(BloomFilterHash* hash) +{ + auto op = std::make_tuple(parquet_field_union_enumerator(1, hash->hash)); + function_builder(this, op); +} + +void CompactProtocolReader::read(BloomFilterCompression* comp) +{ + auto op = std::make_tuple(parquet_field_union_enumerator(1, comp->compression)); + function_builder(this, op); +} + +void CompactProtocolReader::read(BloomFilterHeader* bf) +{ + auto op = std::make_tuple(parquet_field_int32(1, bf->num_bytes), + parquet_field_struct(2, bf->algorithm), + parquet_field_struct(3, bf->hash), + parquet_field_struct(4, bf->compression)); + function_builder(this, op); +} + void CompactProtocolReader::read(ColumnChunkMetaData* c) { using optional_size_statistics = @@ -665,7 +692,9 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c) using optional_list_enc_stats = parquet_field_optional, parquet_field_struct_list>; - auto op = std::make_tuple(parquet_field_enum(1, c->type), + using optional_i64 = parquet_field_optional; + using optional_i32 = parquet_field_optional; + auto op = std::make_tuple(parquet_field_enum(1, c->type), parquet_field_enum_list(2, c->encodings), parquet_field_string_list(3, c->path_in_schema), parquet_field_enum(4, c->codec), @@ -677,6 +706,8 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c) parquet_field_int64(11, c->dictionary_page_offset), parquet_field_struct(12, c->statistics), optional_list_enc_stats(13, c->encoding_stats), + optional_i64(14, c->bloom_filter_offset), + optional_i32(15, c->bloom_filter_length), optional_size_statistics(16, c->size_statistics)); function_builder(this, op); } diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp index b87f2e9c692..360197b19ad 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.hpp +++ b/cpp/src/io/parquet/compact_protocol_reader.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -108,6 +108,10 @@ class CompactProtocolReader { void read(IntType* t); void read(RowGroup* r); void read(ColumnChunk* c); + void read(BloomFilterAlgorithm* bf); + void read(BloomFilterHash* bf); + void read(BloomFilterCompression* bf); + void read(BloomFilterHeader* bf); void read(ColumnChunkMetaData* c); void read(PageHeader* p); void read(DataPageHeader* d); diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index 2851ef67a65..dc0c4b1540e 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -382,12 +382,62 @@ struct ColumnChunkMetaData { // Set of all encodings used for pages in this column chunk. This information can be used to // determine if all data pages are dictionary encoded for example. std::optional> encoding_stats; + // Byte offset from beginning of file to Bloom filter data. 
+ std::optional bloom_filter_offset; + // Size of Bloom filter data including the serialized header, in bytes. Added in 2.10 so readers + // may not read this field from old files and it can be obtained after the BloomFilterHeader has + // been deserialized. Writers should write this field so readers can read the bloom filter in a + // single I/O. + std::optional bloom_filter_length; // Optional statistics to help estimate total memory when converted to in-memory representations. // The histograms contained in these statistics can also be useful in some cases for more // fine-grained nullability/list length filter pushdown. std::optional size_statistics; }; +/** + * @brief The algorithm used in bloom filter + */ +struct BloomFilterAlgorithm { + // Block-based Bloom filter. + enum class Algorithm { UNDEFINED, SPLIT_BLOCK }; + Algorithm algorithm{Algorithm::SPLIT_BLOCK}; +}; + +/** + * @brief The hash function used in Bloom filter + */ +struct BloomFilterHash { + // xxHash_64 + enum class Hash { UNDEFINED, XXHASH }; + Hash hash{Hash::XXHASH}; +}; + +/** + * @brief The compression used in the bloom filter + */ +struct BloomFilterCompression { + enum class Compression { UNDEFINED, UNCOMPRESSED }; + Compression compression{Compression::UNCOMPRESSED}; +}; + +/** + * @brief Bloom filter header struct + * + * The bloom filter data of a column chunk stores this header at the beginning + * following by the filter bitset. + */ +struct BloomFilterHeader { + // The size of bitset in bytes + int32_t num_bytes; + // The algorithm for setting bits + BloomFilterAlgorithm algorithm; + // The hash function used for bloom filter + BloomFilterHash hash; + // The compression used in the bloom filter + BloomFilterCompression compression; +}; + /** * @brief Thrift-derived struct describing a chunk of data for a particular * column diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 9047ff9169b..0e307bac097 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,6 +32,7 @@ #include #include +#include #include #include #include @@ -388,6 +389,7 @@ class stats_expression_converter : public ast::detail::expression_transformer { } // namespace std::optional>> aggregate_reader_metadata::filter_row_groups( + host_span const> sources, host_span const> row_group_indices, host_span output_dtypes, host_span output_column_schemas, @@ -396,7 +398,6 @@ std::optional>> aggregate_reader_metadata::fi { auto mr = cudf::get_current_device_resource_ref(); // Create row group indices. 
- std::vector> filtered_row_group_indices; std::vector> all_row_group_indices; host_span const> input_row_group_indices; if (row_group_indices.empty()) { @@ -412,18 +413,22 @@ std::optional>> aggregate_reader_metadata::fi } else { input_row_group_indices = row_group_indices; } - auto const total_row_groups = std::accumulate(input_row_group_indices.begin(), - input_row_group_indices.end(), - 0, - [](size_type sum, auto const& per_file_row_groups) { - return sum + per_file_row_groups.size(); - }); + auto const total_row_groups = std::accumulate( + input_row_group_indices.begin(), + input_row_group_indices.end(), + size_t{0}, + [](size_t sum, auto const& per_file_row_groups) { return sum + per_file_row_groups.size(); }); + + // Check if we have less than 2B total row groups. + CUDF_EXPECTS(total_row_groups <= std::numeric_limits::max(), + "Total number of row groups exceed the size_type's limit"); // Converts Column chunk statistics to a table // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] // For each column, it contains #sources * #column_chunks_per_src rows. std::vector> columns; - stats_caster const stats_col{total_row_groups, per_file_metadata, input_row_group_indices}; + stats_caster const stats_col{ + static_cast(total_row_groups), per_file_metadata, input_row_group_indices}; for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) { auto const schema_idx = output_column_schemas[col_idx]; auto const& dtype = output_dtypes[col_idx]; @@ -452,44 +457,23 @@ std::optional>> aggregate_reader_metadata::fi CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8, "Filter expression must return a boolean column"); - auto const host_bitmask = [&] { - auto const num_bitmasks = num_bitmask_words(predicate.size()); - if (predicate.nullable()) { - return cudf::detail::make_host_vector_sync( - device_span(predicate.null_mask(), num_bitmasks), stream); - } else { - auto bitmask = cudf::detail::make_host_vector(num_bitmasks, stream); - std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0}); - return bitmask; - } - }(); + // Filter stats table with StatsAST expression and collect filtered row group indices + auto const filtered_row_group_indices = collect_filtered_row_group_indices( + stats_table, stats_expr.get_stats_expr(), input_row_group_indices, stream); - auto validity_it = cudf::detail::make_counting_transform_iterator( - 0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); }); + // Span of row groups to apply bloom filtering on. + auto const bloom_filter_input_row_groups = + filtered_row_group_indices.has_value() + ? host_span const>(filtered_row_group_indices.value()) + : input_row_group_indices; - auto const is_row_group_required = cudf::detail::make_host_vector_sync( - device_span(predicate.data(), predicate.size()), stream); + // Apply bloom filtering on the bloom filter input row groups + auto const bloom_filtered_row_groups = apply_bloom_filters( + sources, bloom_filter_input_row_groups, output_dtypes, output_column_schemas, filter, stream); - // Return only filtered row groups based on predicate - // if all are required or all are nulls, return. 
- if (std::all_of(is_row_group_required.cbegin(), - is_row_group_required.cend(), - [](auto i) { return bool(i); }) or - predicate.null_count() == predicate.size()) { - return std::nullopt; - } - size_type is_required_idx = 0; - for (auto const& input_row_group_index : input_row_group_indices) { - std::vector filtered_row_groups; - for (auto const rg_idx : input_row_group_index) { - if ((!validity_it[is_required_idx]) || is_row_group_required[is_required_idx]) { - filtered_row_groups.push_back(rg_idx); - } - ++is_required_idx; - } - filtered_row_group_indices.push_back(std::move(filtered_row_groups)); - } - return {std::move(filtered_row_group_indices)}; + // Return bloom filtered row group indices iff collected + return bloom_filtered_row_groups.has_value() ? bloom_filtered_row_groups + : filtered_row_group_indices; } // convert column named expression to column index reference expression @@ -510,14 +494,14 @@ named_to_reference_converter::named_to_reference_converter( std::reference_wrapper named_to_reference_converter::visit( ast::literal const& expr) { - _stats_expr = std::reference_wrapper(expr); + _converted_expr = std::reference_wrapper(expr); return expr; } std::reference_wrapper named_to_reference_converter::visit( ast::column_reference const& expr) { - _stats_expr = std::reference_wrapper(expr); + _converted_expr = std::reference_wrapper(expr); return expr; } @@ -531,7 +515,7 @@ std::reference_wrapper named_to_reference_converter::visi } auto col_index = col_index_it->second; _col_ref.emplace_back(col_index); - _stats_expr = std::reference_wrapper(_col_ref.back()); + _converted_expr = std::reference_wrapper(_col_ref.back()); return std::reference_wrapper(_col_ref.back()); } @@ -546,7 +530,7 @@ std::reference_wrapper named_to_reference_converter::visi } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { _operators.emplace_back(op, new_operands.front()); } - _stats_expr = std::reference_wrapper(_operators.back()); + _converted_expr = std::reference_wrapper(_operators.back()); return std::reference_wrapper(_operators.back()); } @@ -640,4 +624,60 @@ class names_from_expression : public ast::detail::expression_transformer { return names_from_expression(expr, skip_names).to_vector(); } +std::optional>> collect_filtered_row_group_indices( + cudf::table_view table, + std::reference_wrapper ast_expr, + host_span const> input_row_group_indices, + rmm::cuda_stream_view stream) +{ + // Filter the input table using AST expression + auto predicate_col = cudf::detail::compute_column( + table, ast_expr.get(), stream, cudf::get_current_device_resource_ref()); + auto predicate = predicate_col->view(); + CUDF_EXPECTS(predicate.type().id() == cudf::type_id::BOOL8, + "Filter expression must return a boolean column"); + + auto const host_bitmask = [&] { + auto const num_bitmasks = num_bitmask_words(predicate.size()); + if (predicate.nullable()) { + return cudf::detail::make_host_vector_sync( + device_span(predicate.null_mask(), num_bitmasks), stream); + } else { + auto bitmask = cudf::detail::make_host_vector(num_bitmasks, stream); + std::fill(bitmask.begin(), bitmask.end(), ~bitmask_type{0}); + return bitmask; + } + }(); + + auto validity_it = cudf::detail::make_counting_transform_iterator( + 0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); }); + + // Return only filtered row groups based on predicate + auto const is_row_group_required = cudf::detail::make_host_vector_sync( + device_span(predicate.data(), predicate.size()), stream); + + // Return if 
all are required, or all are nulls. + if (predicate.null_count() == predicate.size() or std::all_of(is_row_group_required.cbegin(), + is_row_group_required.cend(), + [](auto i) { return bool(i); })) { + return std::nullopt; + } + + // Collect indices of the filtered row groups + size_type is_required_idx = 0; + std::vector> filtered_row_group_indices; + for (auto const& input_row_group_index : input_row_group_indices) { + std::vector filtered_row_groups; + for (auto const rg_idx : input_row_group_index) { + if ((!validity_it[is_required_idx]) || is_row_group_required[is_required_idx]) { + filtered_row_groups.push_back(rg_idx); + } + ++is_required_idx; + } + filtered_row_group_indices.push_back(std::move(filtered_row_groups)); + } + + return {filtered_row_group_indices}; +} + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 0dd1aff41e9..25baa1e0ec8 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1030,6 +1030,7 @@ std::vector aggregate_reader_metadata::get_pandas_index_names() con std::tuple, std::vector> aggregate_reader_metadata::select_row_groups( + host_span const> sources, host_span const> row_group_indices, int64_t skip_rows_opt, std::optional const& num_rows_opt, @@ -1042,7 +1043,7 @@ aggregate_reader_metadata::select_row_groups( // if filter is not empty, then gather row groups to read after predicate pushdown if (filter.has_value()) { filtered_row_group_indices = filter_row_groups( - row_group_indices, output_dtypes, output_column_schemas, filter.value(), stream); + sources, row_group_indices, output_dtypes, output_column_schemas, filter.value(), stream); if (filtered_row_group_indices.has_value()) { row_group_indices = host_span const>(filtered_row_group_indices.value()); diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp index fd692c0cdd6..a28ce616e2c 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.hpp +++ b/cpp/src/io/parquet/reader_impl_helpers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -195,6 +195,38 @@ class aggregate_reader_metadata { */ void column_info_for_row_group(row_group_info& rg_info, size_type chunk_start_row) const; + /** + * @brief Reads bloom filter bitsets for the specified columns from the given lists of row + * groups. 
 + *
+ * @param sources Dataset sources
+ * @param row_group_indices Lists of row groups to read bloom filters from, one per source
+ * @param column_schemas Schema indices of columns whose bloom filters will be read
+ * @param num_row_groups Total number of row groups across all sources
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return A flattened list of bloom filter bitset device buffers for each predicate column across
+ * row groups
+ */
+ [[nodiscard]] std::vector read_bloom_filters(
+ host_span const> sources,
+ host_span const> row_group_indices,
+ host_span column_schemas,
+ size_type num_row_groups,
+ rmm::cuda_stream_view stream) const;
+
+ /**
+ * @brief Collects Parquet types for the columns with the specified schema indices
+ *
+ * @param row_group_indices Lists of row groups, one per source
+ * @param column_schemas Schema indices of columns whose types will be collected
+ *
+ * @return A list of parquet types for the columns matching the provided schema indices
+ */
+ [[nodiscard]] std::vector get_parquet_types(
+ host_span const> row_group_indices,
+ host_span column_schemas) const;
+
 public:
 aggregate_reader_metadata(host_span const> sources,
 bool use_arrow_schema,
@@ -323,26 +355,49 @@ class aggregate_reader_metadata {
 /**
 * @brief Filters the row groups based on predicate filter
 *
+ * @param sources Lists of input datasources
 * @param row_group_indices Lists of row groups to read, one per source
- * @param output_dtypes Datatypes of of output columns
+ * @param output_dtypes Datatypes of output columns
 * @param output_column_schemas schema indices of output columns
 * @param filter AST expression to filter row groups based on Column chunk statistics
 * @param stream CUDA stream used for device memory operations and kernel launches
- * @return Filtered row group indices, if any is filtered.
+ * @return Filtered row group indices, if any are filtered
 */
 [[nodiscard]] std::optional>> filter_row_groups(
+ host_span const> sources,
 host_span const> row_group_indices,
 host_span output_dtypes,
 host_span output_column_schemas,
 std::reference_wrapper filter,
 rmm::cuda_stream_view stream) const;
+
+ /**
+ * @brief Filters the row groups using bloom filters
+ *
+ * @param sources Dataset sources
+ * @param input_row_group_indices Lists of input row groups to read, one per source
+ * @param output_dtypes Datatypes of output columns
+ * @param output_column_schemas schema indices of output columns
+ * @param filter AST expression to filter row groups based on bloom filter membership
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return Filtered row group indices, if any are filtered
+ */
+ [[nodiscard]] std::optional>> apply_bloom_filters(
+ host_span const> sources,
+ host_span const> input_row_group_indices,
+ host_span output_dtypes,
+ host_span output_column_schemas,
+ std::reference_wrapper filter,
+ rmm::cuda_stream_view stream) const;
+
 /**
 * @brief Filters and reduces down to a selection of row groups
 *
 * The input `row_start` and `row_count` parameters will be recomputed and output as the valid
 * values based on the input row group list.
* + * @param sources Lists of input datasources * @param row_group_indices Lists of row groups to read, one per source * @param row_start Starting row of the selection * @param row_count Total number of rows selected @@ -351,10 +406,11 @@ class aggregate_reader_metadata { * @param filter Optional AST expression to filter row groups based on Column chunk statistics * @param stream CUDA stream used for device memory operations and kernel launches * @return A tuple of corrected row_start, row_count, list of row group indexes and its - * starting row, and list of number of rows per source. + * starting row, and list of number of rows per source */ [[nodiscard]] std::tuple, std::vector> - select_row_groups(host_span const> row_group_indices, + select_row_groups(host_span const> sources, + host_span const> row_group_indices, int64_t row_start, std::optional const& row_count, host_span output_dtypes, @@ -413,14 +469,14 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::reference_wrapper visit(ast::operation const& expr) override; /** - * @brief Returns the AST to apply on Column chunk statistics. + * @brief Returns the converted AST expression * * @return AST operation expression */ [[nodiscard]] std::optional> get_converted_expr() const { - return _stats_expr; + return _converted_expr; } private: @@ -428,7 +484,7 @@ class named_to_reference_converter : public ast::detail::expression_transformer cudf::host_span const> operands); std::unordered_map column_name_to_index; - std::optional> _stats_expr; + std::optional> _converted_expr; // Using std::list or std::deque to avoid reference invalidation std::list _col_ref; std::list _operators; @@ -445,4 +501,22 @@ class named_to_reference_converter : public ast::detail::expression_transformer std::optional> expr, std::vector const& skip_names); +/** + * @brief Filter table using the provided (StatsAST or BloomfilterAST) expression and + * collect filtered row group indices + * + * @param table Table of stats or bloom filter membership columns + * @param ast_expr StatsAST or BloomfilterAST expression to filter with + * @param input_row_group_indices Lists of input row groups to read, one per source + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return Collected filtered row group indices, one vector per source, if any. A std::nullopt if + * all row groups are required or if the computed predicate is all nulls + */ +[[nodiscard]] std::optional>> collect_filtered_row_group_indices( + cudf::table_view ast_table, + std::reference_wrapper ast_expr, + host_span const> input_row_group_indices, + rmm::cuda_stream_view stream); + } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 326232ced60..43666f9e42d 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
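In the declarations above, `row_group_indices` always has one entry per datasource, each listing the row groups selected from that source; the statistics and bloom filter helpers walk this structure in the same order. A small illustration of that shape (values are made up):

@code{.cpp}
#include <cudf/types.hpp>
#include <cudf/utilities/span.hpp>

#include <vector>

// Two datasources: the first contributes row groups 0 and 2, the second only row group 1.
std::vector<std::vector<cudf::size_type>> row_group_indices{{0, 2}, {1}};

// The reader-internal helpers take this as a host_span of per-source vectors.
auto const indices_span =
  cudf::host_span<std::vector<cudf::size_type> const>{row_group_indices};
@endcode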
@@ -1286,7 +1286,8 @@ void reader::impl::preprocess_file(read_mode mode) _file_itm_data.global_num_rows, _file_itm_data.row_groups, _file_itm_data.num_rows_per_source) = - _metadata->select_row_groups(_options.row_group_indices, + _metadata->select_row_groups(_sources, + _options.row_group_indices, _options.skip_rows, _options.num_rows, output_dtypes, diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 77924ac0f35..1b67b53ae8e 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -23,8 +23,7 @@ #include "compact_protocol_reader.hpp" #include "compact_protocol_writer.hpp" #include "interop/decimal_conversion_utilities.cuh" -#include "io/comp/gpuinflate.hpp" -#include "io/comp/nvcomp_adapter.hpp" +#include "io/comp/comp.hpp" #include "io/parquet/parquet.hpp" #include "io/parquet/parquet_gpu.hpp" #include "io/statistics/column_statistics.cuh" @@ -67,6 +66,20 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; +Compression to_parquet_compression(compression_type compression) +{ + switch (compression) { + case compression_type::AUTO: + case compression_type::SNAPPY: return Compression::SNAPPY; + case compression_type::ZSTD: return Compression::ZSTD; + case compression_type::LZ4: + // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 + return Compression::LZ4_RAW; + case compression_type::NONE: return Compression::UNCOMPRESSED; + default: CUDF_FAIL("Unsupported compression type"); + } +} + struct aggregate_writer_metadata { aggregate_writer_metadata(host_span partitions, host_span const> kv_md, @@ -1172,7 +1185,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, size_t max_page_size_bytes, size_type max_page_size_rows, bool write_v2_headers, - Compression compression_codec, + compression_type compression, rmm::cuda_stream_view stream) { if (chunks.is_empty()) { return cudf::detail::hostdevice_vector{}; } @@ -1187,7 +1200,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1212,7 +1225,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1221,12 +1234,10 @@ auto init_page_sizes(hostdevice_2dvector& chunks, // Get per-page max compressed size cudf::detail::hostdevice_vector comp_page_sizes(num_pages, stream); - std::transform(page_sizes.begin(), - page_sizes.end(), - comp_page_sizes.begin(), - [compression_codec](auto page_size) { - return max_compression_output_size(compression_codec, page_size); - }); + std::transform( + page_sizes.begin(), page_sizes.end(), comp_page_sizes.begin(), [compression](auto page_size) { + return max_compressed_size(compression, page_size); + }); comp_page_sizes.host_to_device_async(stream); // Use per-page max compressed size to calculate chunk.compressed_size @@ -1238,7 +1249,7 @@ auto init_page_sizes(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression_codec), + compress_required_chunk_alignment(compression), write_v2_headers, nullptr, nullptr, @@ -1247,16 +1258,13 @@ auto init_page_sizes(hostdevice_2dvector& chunks, return comp_page_sizes; } -size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) +size_t 
max_page_bytes(compression_type compression, size_t max_page_size_bytes) { - if (compression == Compression::UNCOMPRESSED) { return max_page_size_bytes; } + if (compression == compression_type::NONE) { return max_page_size_bytes; } - auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) - ? std::nullopt - : nvcomp::compress_max_allowed_chunk_size(ncomp_type); + auto const comp_limit = compress_max_allowed_chunk_size(compression); - auto max_size = std::min(nvcomp_limit.value_or(max_page_size_bytes), max_page_size_bytes); + auto max_size = std::min(comp_limit.value_or(max_page_size_bytes), max_page_size_bytes); // page size must fit in a 32-bit signed integer return std::min(max_size, std::numeric_limits::max()); } @@ -1265,7 +1273,7 @@ std::pair>, std::vector& chunks, host_span col_desc, device_2dspan frags, - Compression compression, + compression_type compression, dictionary_policy dict_policy, size_t max_dict_size, rmm::cuda_stream_view stream) @@ -1404,7 +1412,7 @@ build_chunk_dictionaries(hostdevice_2dvector& chunks, * @param num_columns Total number of columns * @param num_pages Total number of pages * @param num_stats_bfr Number of statistics buffers - * @param compression Compression format + * @param alignment Page alignment * @param max_page_size_bytes Maximum uncompressed page size, in bytes * @param max_page_size_rows Maximum page size, in rows * @param write_v2_headers True if version 2 page headers are to be written @@ -1419,7 +1427,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, uint32_t num_columns, uint32_t num_pages, uint32_t num_stats_bfr, - Compression compression, + size_t alignment, size_t max_page_size_bytes, size_type max_page_size_rows, bool write_v2_headers, @@ -1435,7 +1443,7 @@ void init_encoder_pages(hostdevice_2dvector& chunks, num_columns, max_page_size_bytes, max_page_size_rows, - page_alignment(compression), + alignment, write_v2_headers, (num_stats_bfr) ? page_stats_mrg.data() : nullptr, (num_stats_bfr > num_pages) ? page_stats_mrg.data() + num_pages : nullptr, @@ -1478,7 +1486,7 @@ void encode_pages(hostdevice_2dvector& chunks, statistics_chunk const* chunk_stats, statistics_chunk const* column_stats, std::optional& comp_stats, - Compression compression, + compression_type compression, int32_t column_index_truncate_length, bool write_v2_headers, rmm::cuda_stream_view stream) @@ -1488,7 +1496,7 @@ void encode_pages(hostdevice_2dvector& chunks, ? device_span(page_stats, num_pages) : device_span(); - uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? num_pages : 0; + uint32_t max_comp_pages = (compression != compression_type::NONE) ? 
num_pages : 0; rmm::device_uvector> comp_in(max_comp_pages, stream); rmm::device_uvector> comp_out(max_comp_pages, stream); @@ -1499,34 +1507,7 @@ void encode_pages(hostdevice_2dvector& chunks, compression_result{0, compression_status::FAILURE}); EncodePages(pages, write_v2_headers, comp_in, comp_out, comp_res, stream); - switch (compression) { - case Compression::SNAPPY: - if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { - gpu_snap(comp_in, comp_out, comp_res, stream); - } else { - nvcomp::batched_compress( - nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } - break; - case Compression::ZSTD: { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); - break; - } - case Compression::LZ4_RAW: { - if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::LZ4); - reason) { - CUDF_FAIL("Compression error: " + reason.value()); - } - nvcomp::batched_compress(nvcomp::compression_type::LZ4, comp_in, comp_out, comp_res, stream); - break; - } - case Compression::UNCOMPRESSED: break; - default: CUDF_FAIL("invalid compression type"); - } + compress(compression, comp_in, comp_out, comp_res, stream); // TBD: Not clear if the official spec actually allows dynamically turning off compression at the // chunk-level @@ -1744,7 +1725,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type max_page_size_rows, int32_t column_index_truncate_length, statistics_freq stats_granularity, - Compression compression, + compression_type compression, bool collect_compression_statistics, dictionary_policy dict_policy, size_t max_dictionary_size, @@ -2146,7 +2127,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } // Clear compressed buffer size if compression has been turned off - if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; } + if (compression == compression_type::NONE) { max_comp_bfr_size = 0; } // Initialize data pointers uint32_t const num_stats_bfr = @@ -2214,7 +2195,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, num_columns, num_pages, num_stats_bfr, - compression, + compress_required_chunk_alignment(compression), max_page_size_bytes, max_page_size_rows, write_v2_headers, @@ -2270,7 +2251,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, auto const dev_bfr = ck.is_compressed ? 
ck.compressed_bfr : ck.uncompressed_bfr; auto& column_chunk_meta = row_group.columns[i].meta_data; - if (ck.is_compressed) { column_chunk_meta.codec = compression; } + if (ck.is_compressed) { column_chunk_meta.codec = to_parquet_compression(compression); } if (!out_sink[p]->is_device_write_preferred(ck.compressed_size)) { all_device_write = false; } @@ -2375,7 +2356,7 @@ writer::impl::impl(std::vector> sinks, single_write_mode mode, rmm::cuda_stream_view stream) : _stream(stream), - _compression(to_parquet_compression(options.get_compression())), + _compression(options.get_compression()), _max_row_group_size{options.get_row_group_size_bytes()}, _max_row_group_rows{options.get_row_group_size_rows()}, _max_page_size_bytes(max_page_bytes(_compression, options.get_max_page_size_bytes())), @@ -2406,7 +2387,7 @@ writer::impl::impl(std::vector> sinks, single_write_mode mode, rmm::cuda_stream_view stream) : _stream(stream), - _compression(to_parquet_compression(options.get_compression())), + _compression(options.get_compression()), _max_row_group_size{options.get_row_group_size_bytes()}, _max_row_group_rows{options.get_row_group_size_rows()}, _max_page_size_bytes(max_page_bytes(_compression, options.get_max_page_size_bytes())), diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp index 63128faf993..d5a5a534b93 100644 --- a/cpp/src/io/parquet/writer_impl.hpp +++ b/cpp/src/io/parquet/writer_impl.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -144,7 +144,7 @@ class writer::impl { rmm::cuda_stream_view _stream; // Writer options. - Compression const _compression; + compression_type const _compression; size_t const _max_row_group_size; size_type const _max_row_group_rows; size_t const _max_page_size_bytes; diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp index f15ea1f3c37..ede788c97c2 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.cpp +++ b/cpp/src/io/parquet/writer_impl_helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
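With the changes above, the writer carries the user-facing cudf::io::compression_type end to end and converts it to the Thrift Compression enum via to_parquet_compression only when the column chunk metadata is written. Caller code is unchanged; a minimal sketch (the sink path is a placeholder):

@code{.cpp}
#include <cudf/io/parquet.hpp>
#include <cudf/table/table_view.hpp>

void write_zstd(cudf::table_view const& table)
{
  auto options =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{"out.parquet"}, table)
      .compression(cudf::io::compression_type::ZSTD)
      .build();

  // The writer holds compression_type::ZSTD internally and emits the Parquet
  // Compression::ZSTD codec only when the column chunk metadata is serialized.
  cudf::io::write_parquet(options);
}
@endcode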
@@ -21,8 +21,6 @@ #include "writer_impl_helpers.hpp" -#include "io/comp/nvcomp_adapter.hpp" - #include #include #include @@ -32,48 +30,6 @@ namespace cudf::io::parquet::detail { using namespace cudf::io::detail; -Compression to_parquet_compression(compression_type compression) -{ - switch (compression) { - case compression_type::AUTO: - case compression_type::SNAPPY: return Compression::SNAPPY; - case compression_type::ZSTD: return Compression::ZSTD; - case compression_type::LZ4: - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - return Compression::LZ4_RAW; - case compression_type::NONE: return Compression::UNCOMPRESSED; - default: CUDF_FAIL("Unsupported compression type"); - } -} - -nvcomp::compression_type to_nvcomp_compression_type(Compression codec) -{ - switch (codec) { - case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY; - case Compression::ZSTD: return nvcomp::compression_type::ZSTD; - // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4 - case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4; - default: CUDF_FAIL("Unsupported compression type"); - } -} - -uint32_t page_alignment(Compression codec) -{ - if (codec == Compression::UNCOMPRESSED or - nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { - return 1u; - } - - return nvcomp::required_alignment(to_nvcomp_compression_type(codec)); -} - -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize) -{ - if (codec == Compression::UNCOMPRESSED) return 0; - - return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize); -} - void fill_table_meta(table_input_metadata& table_meta) { // Fill unnamed columns' names in table_meta diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp index 14a9a0ed5b7..b5c73c348fe 100644 --- a/cpp/src/io/parquet/writer_impl_helpers.hpp +++ b/cpp/src/io/parquet/writer_impl_helpers.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -20,48 +20,12 @@ */ #pragma once -#include "parquet_common.hpp" #include #include -#include namespace cudf::io::parquet::detail { -/** - * @brief Function that translates GDF compression to parquet compression. - * - * @param compression The compression type - * @return The supported Parquet compression - */ -Compression to_parquet_compression(compression_type compression); - -/** - * @brief Function that translates the given compression codec to nvcomp compression type. - * - * @param codec Compression codec - * @return Translated nvcomp compression type - */ -cudf::io::detail::nvcomp::compression_type to_nvcomp_compression_type(Compression codec); - -/** - * @brief Function that computes input alignment requirements for the given compression type. - * - * @param codec Compression codec - * @return Required alignment - */ -uint32_t page_alignment(Compression codec); - -/** - * @brief Gets the maximum compressed chunk size for the largest chunk uncompressed chunk in the - * batch. 
- * - * @param codec Compression codec - * @param compression_blocksize Size of the largest uncompressed chunk in the batch - * @return Maximum compressed chunk size - */ -size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize); - /** * @brief Fill the table metadata with default column names. * diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index cea0ebad8f5..726feca328b 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,8 +56,7 @@ void set_up_kvikio() { static std::once_flag flag{}; std::call_once(flag, [] { - auto const compat_mode = - kvikio::detail::getenv_or("KVIKIO_COMPAT_MODE", kvikio::CompatMode::ON); + auto const compat_mode = kvikio::getenv_or("KVIKIO_COMPAT_MODE", kvikio::CompatMode::ON); kvikio::defaults::compat_mode_reset(compat_mode); auto const nthreads = getenv_or("KVIKIO_NTHREADS", 4u); diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp index 75ebc078930..928625a7e8f 100644 --- a/cpp/src/reductions/reductions.cpp +++ b/cpp/src/reductions/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -144,6 +145,13 @@ struct reduce_dispatch_functor { auto td_agg = static_cast(agg); return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr); } + case aggregation::HOST_UDF: { + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for reduction."); + return (*udf_ptr)(col, output_dtype, init, stream, mr); + } // case aggregation::HOST_UDF default: CUDF_FAIL("Unsupported reduction operator"); } } @@ -161,9 +169,11 @@ std::unique_ptr reduce(column_view const& col, cudf::data_type_error); if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || - agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL || + agg.kind == aggregation::HOST_UDF)) { CUDF_FAIL( - "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); + "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF " + "aggregation types"); } // Returns default scalar if input column is empty or all null diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp index 1c3a2b0c0f3..5835bfcf0a1 100644 --- a/cpp/src/reductions/segmented/reductions.cpp +++ b/cpp/src/reductions/segmented/reductions.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
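With the HOST_UDF dispatch above, a host UDF participates in cudf::reduce like any built-in aggregation and may now receive an initial value. A rough sketch, assuming my_udf_aggregation is a cudf::reduce_host_udf implementation (such as the example in host_udf.hpp) and assuming make_host_udf_aggregation accepts a unique_ptr to the UDF instance:

@code{.cpp}
#include <cudf/aggregation/host_udf.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/reduction.hpp>
#include <cudf/scalar/scalar.hpp>

#include <functional>
#include <memory>
#include <optional>

std::unique_ptr<cudf::scalar> reduce_with_udf(cudf::column_view const& input)
{
  // Assumed factory usage; see make_host_udf_aggregation in cudf/aggregation.hpp.
  auto agg = cudf::make_host_udf_aggregation<cudf::reduce_aggregation>(
    std::make_unique<my_udf_aggregation>());

  // HOST_UDF now also accepts an optional initial value, forwarded to the UDF's operator().
  auto const init = cudf::numeric_scalar<double>{0.0};
  return cudf::reduce(input,
                      *agg,
                      cudf::data_type{cudf::type_id::FLOAT64},
                      std::optional<std::reference_wrapper<cudf::scalar const>>{init});
}
@endcode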
@@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + +#include #include #include #include @@ -98,6 +100,13 @@ struct segmented_reduce_dispatch_functor { } case segmented_reduce_aggregation::NUNIQUE: return segmented_nunique(col, offsets, null_handling, stream, mr); + case aggregation::HOST_UDF: { + auto const& udf_base_ptr = + dynamic_cast(agg).udf_ptr; + auto const udf_ptr = dynamic_cast(udf_base_ptr.get()); + CUDF_EXPECTS(udf_ptr != nullptr, "Invalid HOST_UDF instance for segmented reduction."); + return (*udf_ptr)(col, offsets, output_dtype, null_handling, init, stream, mr); + } // case aggregation::HOST_UDF default: CUDF_FAIL("Unsupported aggregation type."); } } @@ -117,9 +126,11 @@ std::unique_ptr segmented_reduce(column_view const& segmented_values, cudf::data_type_error); if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT || agg.kind == aggregation::MIN || agg.kind == aggregation::MAX || - agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) { + agg.kind == aggregation::ANY || agg.kind == aggregation::ALL || + agg.kind == aggregation::HOST_UDF)) { CUDF_FAIL( - "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types"); + "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, ALL, and HOST_UDF " + "aggregation types"); } if (segmented_values.is_empty() && offsets.empty()) { diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu index 997b0278fe2..33d52ccd570 100644 --- a/cpp/src/text/generate_ngrams.cu +++ b/cpp/src/text/generate_ngrams.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
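The segmented-reduction dispatch above calls the UDF with the values column, the segment offsets, the output type, the null policy, and the optional initial value. A skeleton of the operator() a cudf::segmented_reduce_host_udf implementation is expected to provide, inferred from that call site (class name and body are illustrative):

@code{.cpp}
#include <cudf/aggregation/host_udf.hpp>
#include <cudf/column/column_factories.hpp>

#include <functional>
#include <memory>
#include <optional>

struct my_segmented_udf : cudf::segmented_reduce_host_udf {
  [[nodiscard]] std::unique_ptr<cudf::column> operator()(
    cudf::column_view const& values,
    cudf::device_span<cudf::size_type const> offsets,
    cudf::data_type output_dtype,
    cudf::null_policy null_handling,
    std::optional<std::reference_wrapper<cudf::scalar const>> init,
    rmm::cuda_stream_view stream,
    rmm::device_async_resource_ref mr) const override
  {
    // One output row per segment; segment i spans values[offsets[i], offsets[i + 1]).
    // ... segmented reduction logic goes here ...
    return cudf::make_empty_column(output_dtype);
  }

  [[nodiscard]] bool is_equal(cudf::host_udf_base const& other) const override
  {
    return dynamic_cast<my_segmented_udf const*>(&other) != nullptr;
  }

  [[nodiscard]] std::unique_ptr<cudf::host_udf_base> clone() const override
  {
    return std::make_unique<my_segmented_udf>();
  }
};
@endcode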
@@ -32,7 +32,7 @@ #include #include -#include +#include #include #include @@ -315,6 +315,7 @@ namespace { */ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings, cudf::size_type ngrams, + uint32_t seed, cudf::size_type const* d_ngram_offsets, cudf::hash_value_type* d_results) { @@ -332,7 +333,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st __shared__ cudf::hash_value_type hvs[block_size]; // temp store for hash values auto const ngram_offset = d_ngram_offsets[str_idx]; - auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{0}; + auto const hasher = cudf::hashing::detail::MurmurHash3_x86_32{seed}; auto const end = d_str.data() + d_str.size_bytes(); auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1; @@ -368,6 +369,7 @@ CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_st std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& input, cudf::size_type ngrams, + uint32_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -400,7 +402,7 @@ std::unique_ptr hash_character_ngrams(cudf::strings_column_view co auto d_hashes = hashes->mutable_view().data(); character_ngram_hash_kernel<<>>( - *d_strings, ngrams, d_offsets, d_hashes); + *d_strings, ngrams, seed, d_offsets, d_hashes); return make_lists_column( input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr); @@ -419,11 +421,12 @@ std::unique_ptr generate_character_ngrams(cudf::strings_column_vie std::unique_ptr hash_character_ngrams(cudf::strings_column_view const& strings, cudf::size_type ngrams, + uint32_t seed, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return detail::hash_character_ngrams(strings, ngrams, stream, mr); + return detail::hash_character_ngrams(strings, ngrams, seed, stream, mr); } } // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 344979e1288..6a89b1e48d6 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -220,11 +220,12 @@ ConfigureTest( REDUCTIONS_TEST reductions/collect_ops_tests.cpp reductions/ewm_tests.cpp + reductions/host_udf_example_tests.cu + reductions/list_rank_test.cpp reductions/rank_tests.cpp reductions/reduction_tests.cpp reductions/scan_tests.cpp reductions/segmented_reduction_tests.cpp - reductions/list_rank_test.cpp reductions/tdigest_tests.cu GPUS 1 PERCENT 70 @@ -317,14 +318,15 @@ ConfigureTest( ) ConfigureTest( PARQUET_TEST - io/parquet_test.cpp + io/parquet_bloom_filter_test.cu io/parquet_chunked_reader_test.cu io/parquet_chunked_writer_test.cpp io/parquet_common.cpp io/parquet_misc_test.cpp io/parquet_reader_test.cpp - io/parquet_writer_test.cpp + io/parquet_test.cpp io/parquet_v2_test.cpp + io/parquet_writer_test.cpp GPUS 1 PERCENT 30 ) diff --git a/cpp/tests/groupby/host_udf_example_tests.cu b/cpp/tests/groupby/host_udf_example_tests.cu index a454bd692fc..e1ded37d8a7 100644 --- a/cpp/tests/groupby/host_udf_example_tests.cu +++ b/cpp/tests/groupby/host_udf_example_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
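The new seed parameter on nvtext::hash_character_ngrams above lets callers draw the ngram hashes from different hash functions; a seed of 0 reproduces the previous unseeded behaviour. An illustrative call with made-up input strings (uses the test column wrapper for brevity):

@code{.cpp}
#include <cudf/column/column_view.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <nvtext/generate_ngrams.hpp>

void hash_ngrams_example()
{
  auto const input = cudf::test::strings_column_wrapper({"hello world", "goodnight moon"});
  auto const view  = cudf::strings_column_view{cudf::column_view{input}};

  // Hash 5-character ngrams of each string with an explicit seed.
  auto const hashes = nvtext::hash_character_ngrams(view, 5, 0);
}
@endcode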
@@ -21,9 +21,7 @@ #include #include #include -#include #include -#include #include #include @@ -34,6 +32,9 @@ #include #include +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; + namespace { /** * @brief A host-based UDF implementation for groupby. @@ -41,42 +42,21 @@ namespace { * For each group of values, the aggregation computes * `(group_idx + 1) * group_sum_of_squares - group_max * group_sum`. */ -struct host_udf_groupby_example : cudf::host_udf_base { +struct host_udf_groupby_example : cudf::groupby_host_udf { host_udf_groupby_example() = default; - [[nodiscard]] data_attribute_set_t get_required_data() const override - { - // We need grouped values, group offsets, group labels, and also results from groups' - // MAX and SUM aggregations. - return {groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS, - cudf::make_max_aggregation(), - cudf::make_sum_aggregation()}; - } - - [[nodiscard]] output_t get_empty_output( - [[maybe_unused]] std::optional output_dtype, - [[maybe_unused]] rmm::cuda_stream_view stream, - [[maybe_unused]] rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr get_empty_output( + rmm::cuda_stream_view, rmm::device_async_resource_ref) const override { return cudf::make_empty_column( cudf::data_type{cudf::type_to_id()}); } - [[nodiscard]] output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override { - auto const& values = - std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); - return cudf::type_dispatcher(values.type(), groupby_fn{this}, input, stream, mr); - } - - [[nodiscard]] std::size_t do_hash() const override - { - // Just return the same hash for all instances of this class. - return std::size_t{12345}; + auto const values = get_grouped_values(); + return cudf::type_dispatcher(values.type(), groupby_fn{*this}, stream, mr); } [[nodiscard]] bool is_equal(host_udf_base const& other) const override @@ -92,37 +72,33 @@ struct host_udf_groupby_example : cudf::host_udf_base { struct groupby_fn { // Store pointer to the parent class so we can call its functions. - host_udf_groupby_example const* parent; + host_udf_groupby_example const& parent; - // For simplicity, this example only accepts double input and always produces double output. + // For simplicity, this example only accepts a single type input and output. using InputType = double; using OutputType = double; template )> - output_t operator()(Args...) const + std::unique_ptr operator()(Args...) 
const { CUDF_FAIL("Unsupported input type."); } template )> - output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const + std::unique_ptr operator()(rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - auto const& values = - std::get(input.at(groupby_data_attribute::GROUPED_VALUES)); - if (values.size() == 0) { return parent->get_empty_output(std::nullopt, stream, mr); } + auto const values = parent.get_grouped_values(); + if (values.size() == 0) { return parent.get_empty_output(stream, mr); } - auto const offsets = std::get>( - input.at(groupby_data_attribute::GROUP_OFFSETS)); + auto const offsets = parent.get_group_offsets(); CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets."); auto const num_groups = static_cast(offsets.size()) - 1; - auto const group_indices = std::get>( - input.at(groupby_data_attribute::GROUP_LABELS)); - auto const group_max = std::get( - input.at(cudf::make_max_aggregation())); - auto const group_sum = std::get( - input.at(cudf::make_sum_aggregation())); + auto const group_indices = parent.get_group_labels(); + auto const group_max = + parent.compute_aggregation(cudf::make_max_aggregation()); + auto const group_sum = + parent.compute_aggregation(cudf::make_sum_aggregation()); auto const values_dv_ptr = cudf::column_device_view::create(values, stream); auto const output = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, @@ -191,9 +167,6 @@ struct host_udf_groupby_example : cudf::host_udf_base { } // namespace -using doubles_col = cudf::test::fixed_width_column_wrapper; -using int32s_col = cudf::test::fixed_width_column_wrapper; - struct HostUDFGroupbyExampleTest : cudf::test::BaseFixture {}; TEST_F(HostUDFGroupbyExampleTest, SimpleInput) diff --git a/cpp/tests/groupby/host_udf_tests.cpp b/cpp/tests/groupby/host_udf_tests.cpp index 1a0f68c0c6c..17da28cdefc 100644 --- a/cpp/tests/groupby/host_udf_tests.cpp +++ b/cpp/tests/groupby/host_udf_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,178 +26,121 @@ #include namespace { + /** - * @brief A host-based UDF implementation used for unit tests. + * @brief Generate a random aggregation object from {min, max, sum, product}. */ -struct host_udf_test_base : cudf::host_udf_base { +std::unique_ptr get_random_agg() +{ + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distr(1, 4); + switch (distr(gen)) { + case 1: return cudf::make_min_aggregation(); + case 2: return cudf::make_max_aggregation(); + case 3: return cudf::make_sum_aggregation(); + case 4: return cudf::make_product_aggregation(); + default: CUDF_UNREACHABLE("This should not be reached."); + } + return nullptr; +} + +/** + * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
+ */ +struct host_udf_groupby_test : cudf::groupby_host_udf { int test_location_line; // the location where testing is called bool* test_run; // to check if the test is accidentally skipped - data_attribute_set_t input_attrs; + bool test_other_agg; // test calling other aggregation - host_udf_test_base(int test_location_line_, bool* test_run_, data_attribute_set_t input_attrs_) - : test_location_line{test_location_line_}, - test_run{test_run_}, - input_attrs(std::move(input_attrs_)) + host_udf_groupby_test(int test_location_line_, bool* test_run_, bool test_other_agg_) + : test_location_line{test_location_line_}, test_run{test_run_}, test_other_agg{test_other_agg_} { } - [[nodiscard]] data_attribute_set_t get_required_data() const override { return input_attrs; } - - // This is the main testing function, which checks for the correctness of input data. - // The rests are just to satisfy the interface. - [[nodiscard]] output_t operator()(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override + [[nodiscard]] std::size_t do_hash() const override { return 0; } + [[nodiscard]] bool is_equal(host_udf_base const& other) const override { - SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); - - test_data_attributes(input, stream, mr); - - *test_run = true; // test is run successfully - return get_empty_output(std::nullopt, stream, mr); + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(test_location_line, test_run, test_other_agg); } - [[nodiscard]] output_t get_empty_output( - [[maybe_unused]] std::optional output_dtype, + [[nodiscard]] std::unique_ptr get_empty_output( [[maybe_unused]] rmm::cuda_stream_view stream, [[maybe_unused]] rmm::device_async_resource_ref mr) const override { - // Unused function - dummy output. + // Dummy output. return cudf::make_empty_column(cudf::data_type{cudf::type_id::INT32}); } - [[nodiscard]] std::size_t do_hash() const override { return 0; } - [[nodiscard]] bool is_equal(host_udf_base const& other) const override { return true; } + [[nodiscard]] std::unique_ptr operator()( + rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const override + { + SCOPED_TRACE("Test instance created at line: " + std::to_string(test_location_line)); - // The main test function, which must be implemented for each kind of aggregations - // (groupby/reduction/segmented_reduction). - virtual void test_data_attributes(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const = 0; -}; + // Perform tests on types for the groupby data: we must ensure the data corresponding to each + // `groupby_data` enum having the correct type. -/** - * @brief A host-based UDF implementation used for unit tests for groupby aggregation. 
- */ -struct host_udf_groupby_test : host_udf_test_base { - host_udf_groupby_test(int test_location_line_, - bool* test_run_, - data_attribute_set_t input_attrs_ = {}) - : host_udf_test_base(test_location_line_, test_run_, std::move(input_attrs_)) - { - } + { + auto const inp_data = get_input_values(); + EXPECT_TRUE((std::is_same_v>)); + } - [[nodiscard]] std::unique_ptr clone() const override - { - return std::make_unique(test_location_line, test_run, input_attrs); - } + { + auto const inp_data = get_grouped_values(); + EXPECT_TRUE((std::is_same_v>)); + } - void test_data_attributes(input_map_t const& input, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const override - { - data_attribute_set_t check_attrs = input_attrs; - if (check_attrs.empty()) { - check_attrs = data_attribute_set_t{groupby_data_attribute::INPUT_VALUES, - groupby_data_attribute::GROUPED_VALUES, - groupby_data_attribute::SORTED_GROUPED_VALUES, - groupby_data_attribute::NUM_GROUPS, - groupby_data_attribute::GROUP_OFFSETS, - groupby_data_attribute::GROUP_LABELS}; + { + auto const inp_data = get_sorted_grouped_values(); + EXPECT_TRUE((std::is_same_v>)); } - EXPECT_EQ(input.size(), check_attrs.size()); - for (auto const& attr : check_attrs) { - EXPECT_TRUE(input.count(attr) > 0); - EXPECT_TRUE(std::holds_alternative(attr.value) || - std::holds_alternative>(attr.value)); - if (std::holds_alternative(attr.value)) { - switch (std::get(attr.value)) { - case groupby_data_attribute::INPUT_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::GROUPED_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::SORTED_GROUPED_VALUES: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::NUM_GROUPS: - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - break; - case groupby_data_attribute::GROUP_OFFSETS: - EXPECT_TRUE( - std::holds_alternative>(input.at(attr))); - break; - case groupby_data_attribute::GROUP_LABELS: - EXPECT_TRUE( - std::holds_alternative>(input.at(attr))); - break; - default:; - } - } else { // std::holds_alternative>(attr.value) - EXPECT_TRUE(std::holds_alternative(input.at(attr))); - } + + { + auto const inp_data = get_num_groups(); + EXPECT_TRUE((std::is_same_v>)); } - } -}; -/** - * @brief Get a random subset of input data attributes. - */ -cudf::host_udf_base::data_attribute_set_t get_subset( - cudf::host_udf_base::data_attribute_set_t const& attrs) -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution size_distr(1, attrs.size() - 1); - auto const subset_size = size_distr(gen); - auto const elements = - std::vector(attrs.begin(), attrs.end()); - std::uniform_int_distribution idx_distr(0, attrs.size() - 1); - cudf::host_udf_base::data_attribute_set_t output; - while (output.size() < subset_size) { - output.insert(elements[idx_distr(gen)]); - } - return output; -} + { + auto const inp_data = get_group_offsets(); + EXPECT_TRUE((std::is_same_v, + std::decay_t>)); + } -/** - * @brief Generate a random aggregation object from {min, max, sum, product}. 
- */ -std::unique_ptr get_random_agg() -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_int_distribution distr(1, 4); - switch (distr(gen)) { - case 1: return cudf::make_min_aggregation(); - case 2: return cudf::make_max_aggregation(); - case 3: return cudf::make_sum_aggregation(); - case 4: return cudf::make_product_aggregation(); - default: CUDF_UNREACHABLE("This should not be reached."); + { + auto const inp_data = get_group_labels(); + EXPECT_TRUE((std::is_same_v, + std::decay_t>)); + } + + // Perform tests on type of the result from computing other aggregations. + if (test_other_agg) { + auto const inp_data = compute_aggregation(get_random_agg()); + EXPECT_TRUE((std::is_same_v>)); + } + + *test_run = true; // test is run successfully + return get_empty_output(stream, mr); } - return nullptr; -} +}; } // namespace using int32s_col = cudf::test::fixed_width_column_wrapper; -// Number of randomly testing on the input data attributes. -// For each test, a subset of data attributes will be randomly generated from all the possible input -// data attributes. The input data corresponding to that subset passed from libcudf will be tested -// for correctness. -constexpr int NUM_RANDOM_TESTS = 20; - struct HostUDFTest : cudf::test::BaseFixture {}; -TEST_F(HostUDFTest, GroupbyAllInput) +TEST_F(HostUDFTest, GroupbyBuiltinInput) { bool test_run = false; auto const keys = int32s_col{0, 1, 2}; auto const vals = int32s_col{0, 1, 2}; auto agg = cudf::make_host_udf_aggregation( - std::make_unique(__LINE__, &test_run)); + std::make_unique(__LINE__, &test_run, /*test_other_agg*/ false)); std::vector requests; requests.emplace_back(); @@ -205,28 +148,22 @@ TEST_F(HostUDFTest, GroupbyAllInput) requests[0].aggregations.push_back(std::move(agg)); cudf::groupby::groupby gb_obj( cudf::table_view({keys}), cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); - [[maybe_unused]] auto const grp_result = - gb_obj.aggregate(requests, cudf::test::get_default_stream()); + [[maybe_unused]] auto const grp_result = gb_obj.aggregate( + requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); EXPECT_TRUE(test_run); } -TEST_F(HostUDFTest, GroupbySomeInput) +TEST_F(HostUDFTest, GroupbyWithCallingOtherAggregations) { - auto const keys = int32s_col{0, 1, 2}; - auto const vals = int32s_col{0, 1, 2}; - auto const all_attrs = cudf::host_udf_base::data_attribute_set_t{ - cudf::host_udf_base::groupby_data_attribute::INPUT_VALUES, - cudf::host_udf_base::groupby_data_attribute::GROUPED_VALUES, - cudf::host_udf_base::groupby_data_attribute::SORTED_GROUPED_VALUES, - cudf::host_udf_base::groupby_data_attribute::NUM_GROUPS, - cudf::host_udf_base::groupby_data_attribute::GROUP_OFFSETS, - cudf::host_udf_base::groupby_data_attribute::GROUP_LABELS}; + auto const keys = int32s_col{0, 1, 2}; + auto const vals = int32s_col{0, 1, 2}; + + constexpr int NUM_RANDOM_TESTS = 20; + for (int i = 0; i < NUM_RANDOM_TESTS; ++i) { - bool test_run = false; - auto input_attrs = get_subset(all_attrs); - input_attrs.insert(get_random_agg()); - auto agg = cudf::make_host_udf_aggregation( - std::make_unique(__LINE__, &test_run, std::move(input_attrs))); + bool test_run = false; + auto agg = cudf::make_host_udf_aggregation( + std::make_unique(__LINE__, &test_run, /*test_other_agg*/ true)); std::vector requests; requests.emplace_back(); @@ -234,8 +171,8 @@ TEST_F(HostUDFTest, GroupbySomeInput) requests[0].aggregations.push_back(std::move(agg)); cudf::groupby::groupby gb_obj( cudf::table_view({keys}), 
cudf::null_policy::INCLUDE, cudf::sorted::NO, {}, {}); - [[maybe_unused]] auto const grp_result = - gb_obj.aggregate(requests, cudf::test::get_default_stream()); + [[maybe_unused]] auto const grp_result = gb_obj.aggregate( + requests, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()); EXPECT_TRUE(test_run); } } diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 2209a30149d..708c2045a74 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -2068,6 +2068,7 @@ TEST_P(OrcCompressionTest, Basic) INSTANTIATE_TEST_CASE_P(OrcCompressionTest, OrcCompressionTest, ::testing::Values(cudf::io::compression_type::NONE, + cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, cudf::io::compression_type::ZSTD)); diff --git a/cpp/tests/io/parquet_bloom_filter_test.cu b/cpp/tests/io/parquet_bloom_filter_test.cu new file mode 100644 index 00000000000..d858f58fa56 --- /dev/null +++ b/cpp/tests/io/parquet_bloom_filter_test.cu @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +using StringType = cudf::string_view; + +class ParquetBloomFilterTest : public cudf::test::BaseFixture {}; + +TEST_F(ParquetBloomFilterTest, TestStrings) +{ + using key_type = StringType; + using policy_type = cuco::arrow_filter_policy; + using word_type = policy_type::word_type; + + std::size_t constexpr num_filter_blocks = 4; + auto stream = cudf::get_default_stream(); + + // strings keys to insert + auto keys = cudf::test::strings_column_wrapper( + {"seventh", "fifteenth", "second", "tenth", "fifth", "first", + "seventh", "tenth", "ninth", "ninth", "seventeenth", "eighteenth", + "thirteenth", "fifth", "fourth", "twelfth", "second", "second", + "fourth", "seventh", "seventh", "tenth", "thirteenth", "seventeenth", + "fifth", "seventeenth", "eighth", "fourth", "second", "eighteenth", + "fifteenth", "second", "seventeenth", "thirteenth", "eighteenth", "fifth", + "seventh", "tenth", "fourteenth", "first", "fifth", "fifth", + "tenth", "thirteenth", "fourteenth", "third", "third", "sixth", + "first", "third"}); + + auto d_keys = cudf::column_device_view::create(keys); + + // Spawn a bloom filter + cuco::bloom_filter, + cuda::thread_scope_device, + policy_type, + cudf::detail::cuco_allocator> + filter{num_filter_blocks, + cuco::thread_scope_device, + {{cudf::DEFAULT_HASH_SEED}}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream}; + + // Add strings to the bloom filter + filter.add(d_keys->begin(), d_keys->end(), stream); + + // Number of words in the filter + cudf::size_type const num_words = filter.block_extent() * filter.words_per_block; + + // Filter bitset as a column + auto const bitset = cudf::column_view{ + cudf::data_type{cudf::type_id::UINT32}, num_words, filter.data(), nullptr, 0, 0, {}}; + + // Expected filter bitset words computed using Arrow's implementation here: + // https://godbolt.org/z/oKfqcPWbY + auto expected = cudf::test::fixed_width_column_wrapper( + {4194306U, 4194305U, 2359296U, 1073774592U, 524544U, 1024U, 268443648U, + 8519680U, 2147500040U, 8421380U, 269500416U, 4202624U, 8396802U, 100665344U, + 2147747840U, 5243136U, 131146U, 655364U, 285345792U, 134222340U, 545390596U, + 2281717768U, 51201U, 41943553U, 1619656708U, 67441680U, 8462730U, 361220U, + 2216738864U, 587333888U, 4219272U, 873463873U}); + + // Check the bitset for equality + CUDF_TEST_EXPECT_COLUMNS_EQUAL(bitset, expected); +} diff --git a/cpp/tests/io/parquet_misc_test.cpp b/cpp/tests/io/parquet_misc_test.cpp index d66f685cd9c..419ac909ac6 100644 --- a/cpp/tests/io/parquet_misc_test.cpp +++ b/cpp/tests/io/parquet_misc_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -268,6 +268,7 @@ TEST_P(ParquetCompressionTest, Basic) INSTANTIATE_TEST_CASE_P(ParquetCompressionTest, ParquetCompressionTest, ::testing::Values(cudf::io::compression_type::NONE, + cudf::io::compression_type::AUTO, cudf::io::compression_type::SNAPPY, cudf::io::compression_type::LZ4, cudf::io::compression_type::ZSTD)); diff --git a/cpp/tests/reductions/host_udf_example_tests.cu b/cpp/tests/reductions/host_udf_example_tests.cu new file mode 100644 index 00000000000..67b88c5306b --- /dev/null +++ b/cpp/tests/reductions/host_udf_example_tests.cu @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +using doubles_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; +using int64s_col = cudf::test::fixed_width_column_wrapper; + +namespace { +/** + * @brief A host-based UDF implementation for reduction. + * + * The aggregation computes `sum(value^2, for value in group)` (this is sum of squared). + */ +struct host_udf_reduction_example : cudf::reduce_host_udf { + host_udf_reduction_example() = default; + + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + return cudf::double_type_dispatcher( + input.type(), output_dtype, reduce_fn{}, input, output_dtype, init, stream, mr); + } + + [[nodiscard]] bool is_equal(host_udf_base const& other) const override + { + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(); + } + + struct reduce_fn { + // For simplicity, this example only accepts a single type input and output. + using InputType = double; + using OutputType = int64_t; + + template || !std::is_same_v)> + std::unique_ptr operator()(Args...) 
const + { + CUDF_FAIL("Unsupported input/output type."); + } + + template && std::is_same_v)> + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::data_type output_dtype, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id()}, + "Invalid output type."); + if (input.size() == 0) { + return cudf::make_default_constructed_scalar(output_dtype, stream, mr); + } + + auto const init_value = [&]() -> InputType { + if (init.has_value() && init.value().get().is_valid(stream)) { + auto const numeric_init_scalar = + dynamic_cast const*>(&init.value().get()); + CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction."); + return numeric_init_scalar->value(stream); + } + return InputType{0}; + }(); + + auto const input_dv_ptr = cudf::column_device_view::create(input, stream); + auto const result = thrust::transform_reduce(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(input.size()), + transform_fn{*input_dv_ptr}, + static_cast(init_value), + thrust::plus<>{}); + + auto output = cudf::make_numeric_scalar(output_dtype, stream, mr); + static_cast*>(output.get())->set_value(result, stream); + return output; + } + + struct transform_fn { + cudf::column_device_view values; + OutputType __device__ operator()(cudf::size_type idx) const + { + if (values.is_null(idx)) { return OutputType{0}; } + auto const val = static_cast(values.element(idx)); + return val * val; + } + }; + }; +}; + +} // namespace + +struct HostUDFReductionExampleTest : cudf::test::BaseFixture {}; + +TEST_F(HostUDFReductionExampleTest, SimpleInput) +{ + auto const vals = doubles_col{0.0, 1.0, 2.0, 3.0, 4.0, 5.0}; + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const reduced = cudf::reduce(vals, + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + EXPECT_TRUE(reduced->is_valid()); + EXPECT_EQ(cudf::type_id::INT64, reduced->type().id()); + auto const result = + static_cast*>(reduced.get())->value(cudf::get_default_stream()); + auto constexpr expected = 55; // 0^2 + 1^2 + 2^2 + 3^2 + 4^2 + 5^2 = 55 + EXPECT_EQ(expected, result); +} + +TEST_F(HostUDFReductionExampleTest, EmptyInput) +{ + auto const vals = doubles_col{}; + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const reduced = cudf::reduce(vals, + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + EXPECT_FALSE(reduced->is_valid()); + EXPECT_EQ(cudf::type_id::INT64, reduced->type().id()); +} + +namespace { + +/** + * @brief A host-based UDF implementation for segmented reduction. + * + * The aggregation computes `sum(value^2, for value in group)` (this is sum of squared). 
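+ * Each segment's sum of squares (seeded with the init value when one is given) is additionally + * multiplied by the number of rows in that segment, as reflected in the expected results of the + * tests below.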
+ */ +struct host_udf_segmented_reduction_example : cudf::segmented_reduce_host_udf { + host_udf_segmented_reduction_example() = default; + + [[nodiscard]] std::unique_ptr operator()( + cudf::column_view const& input, + cudf::device_span offsets, + cudf::data_type output_dtype, + cudf::null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const override + { + return cudf::double_type_dispatcher(input.type(), + output_dtype, + segmented_reduce_fn{}, + input, + offsets, + output_dtype, + null_handling, + init, + stream, + mr); + } + + [[nodiscard]] bool is_equal(host_udf_base const& other) const override + { + // Just check if the other object is also instance of this class. + return dynamic_cast(&other) != nullptr; + } + + [[nodiscard]] std::unique_ptr clone() const override + { + return std::make_unique(); + } + + struct segmented_reduce_fn { + // For simplicity, this example only accepts a single type input and output. + using InputType = double; + using OutputType = int64_t; + + template || !std::is_same_v)> + std::unique_ptr operator()(Args...) const + { + CUDF_FAIL("Unsupported input/output type."); + } + + template && std::is_same_v)> + std::unique_ptr operator()( + cudf::column_view const& input, + cudf::device_span offsets, + cudf::data_type output_dtype, + cudf::null_policy null_handling, + std::optional> init, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const + { + CUDF_EXPECTS(output_dtype == cudf::data_type{cudf::type_to_id()}, + "Invalid output type."); + CUDF_EXPECTS(offsets.size() > 0, "Invalid offsets."); + auto const num_segments = static_cast(offsets.size()) - 1; + + if (input.size() == 0) { + if (num_segments <= 0) { return cudf::make_empty_column(output_dtype); } + return cudf::make_numeric_column( + output_dtype, num_segments, cudf::mask_state::ALL_NULL, stream, mr); + } + + auto const init_value = [&]() -> InputType { + if (init.has_value() && init.value().get().is_valid(stream)) { + auto const numeric_init_scalar = + dynamic_cast const*>(&init.value().get()); + CUDF_EXPECTS(numeric_init_scalar != nullptr, "Invalid init scalar for reduction."); + return numeric_init_scalar->value(stream); + } + return InputType{0}; + }(); + + auto const input_dv_ptr = cudf::column_device_view::create(input, stream); + auto output = cudf::make_numeric_column( + output_dtype, num_segments, cudf::mask_state::UNALLOCATED, stream); + + // Store row index if it is valid, otherwise store a negative value denoting a null row. 
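+ // A negative index is out of bounds for the gather below, so out_of_bounds_policy::NULLIFY turns the output rows of empty segments into nulls.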
+ rmm::device_uvector valid_idx(num_segments, stream); + + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_segments), + thrust::make_zip_iterator(output->mutable_view().begin(), valid_idx.begin()), + transform_fn{*input_dv_ptr, offsets, static_cast(init_value), null_handling}); + + auto const valid_idx_cv = cudf::column_view{ + cudf::data_type{cudf::type_id::INT32}, num_segments, valid_idx.begin(), nullptr, 0}; + return std::move(cudf::gather(cudf::table_view{{output->view()}}, + valid_idx_cv, + cudf::out_of_bounds_policy::NULLIFY, + stream, + mr) + ->release() + .front()); + } + + struct transform_fn { + cudf::column_device_view values; + cudf::device_span offsets; + OutputType init_value; + cudf::null_policy null_handling; + + thrust::tuple __device__ operator()(cudf::size_type idx) const + { + auto const start = offsets[idx]; + auto const end = offsets[idx + 1]; + + auto constexpr invalid_idx = cuda::std::numeric_limits::lowest(); + if (start == end) { return {OutputType{0}, invalid_idx}; } + + auto sum = init_value; + for (auto i = start; i < end; ++i) { + if (values.is_null(i)) { + if (null_handling == cudf::null_policy::INCLUDE) { sum += init_value * init_value; } + continue; + } + auto const val = static_cast(values.element(i)); + sum += val * val; + } + auto const segment_size = end - start; + return {static_cast(segment_size) * sum, idx}; + } + }; + }; +}; + +} // namespace + +struct HostUDFSegmentedReductionExampleTest : cudf::test::BaseFixture {}; + +TEST_F(HostUDFSegmentedReductionExampleTest, SimpleInput) +{ + double constexpr null = 0.0; + auto const vals = doubles_col{{0.0, null, 2.0, 3.0, null, 5.0, null, null, 8.0, 9.0}, + {true, false, true, true, false, true, false, false, true, true}}; + auto const offsets = int32s_col{0, 3, 5, 10}.release(); + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + + // Test without init value. + { + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + std::nullopt, // init value + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + // When null_policy is set to `INCLUDE`, the null values are replaced with the init value. + // Since init value is not given, it is set to 0. + // [ 3 * (0^2 + init^2 + 2^2), 2 * (3^2 + init^2), 5 * (5^2 + init^2 + init^2 + 8^2 + 9^2) ] + auto const expected = int64s_col{{12, 18, 850}, {true, true, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with init value, and include nulls. + { + auto const init_scalar = cudf::make_fixed_width_scalar(3.0); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + *init_scalar, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + // When null_policy is set to `INCLUDE`, the null values are replaced with the init value. + // [ 3 * (3 + 0^2 + 3^2 + 2^2), 2 * (3 + 3^2 + 3^2), 5 * (3 + 5^2 + 3^2 + 3^2 + 8^2 + 9^2) ] + auto const expected = int64s_col{{48, 42, 955}, {true, true, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } + + // Test with init value, and exclude nulls. 
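+ // With null_policy::EXCLUDE, null rows contribute nothing to the segment sums; the init value is still counted once per segment.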
+ { + auto const init_scalar = cudf::make_fixed_width_scalar(3.0); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::EXCLUDE, + *init_scalar, + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + + // [ 3 * (3 + 0^2 + 2^2), 2 * (3 + 3^2), 5 * (3 + 5^2 + 8^2 + 9^2) ] + auto const expected = int64s_col{{21, 24, 865}, {true, true, true}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); + } +} + +TEST_F(HostUDFSegmentedReductionExampleTest, EmptySegments) +{ + auto const vals = doubles_col{}; + auto const offsets = int32s_col{0, 0, 0, 0}.release(); + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + std::nullopt, // init value + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + auto const expected = int64s_col{{0, 0, 0}, {false, false, false}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} + +TEST_F(HostUDFSegmentedReductionExampleTest, EmptyInput) +{ + auto const vals = doubles_col{}; + auto const offsets = int32s_col{}.release(); + auto const agg = cudf::make_host_udf_aggregation( + std::make_unique()); + auto const result = cudf::segmented_reduce( + vals, + cudf::device_span(offsets->view().begin(), offsets->size()), + *agg, + cudf::data_type{cudf::type_id::INT64}, + cudf::null_policy::INCLUDE, + std::nullopt, // init value + cudf::get_default_stream(), + cudf::get_current_device_resource_ref()); + auto const expected = int64s_col{}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result); +} diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp index 221c0a62f3e..47b9ac46d12 100644 --- a/cpp/tests/streams/text/ngrams_test.cpp +++ b/cpp/tests/streams/text/ngrams_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -45,7 +45,7 @@ TEST_F(TextNGramsTest, HashCharacterNgrams) auto input = cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."}); nvtext::hash_character_ngrams( - cudf::strings_column_view(input), 5, cudf::test::get_default_stream()); + cudf::strings_column_view(input), 5, 5, cudf::test::get_default_stream()); } TEST_F(TextNGramsTest, NgramsTokenize) diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp index c72c7cfc80e..1a737231389 100644 --- a/cpp/tests/text/ngrams_tests.cpp +++ b/cpp/tests/text/ngrams_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -159,6 +159,17 @@ TEST_F(TextGenerateNgramsTest, NgramsHash) 2319357747u}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::hash_character_ngrams(view, 10, 10); + // clang-format off + LCW expected2({LCW{2818025299u, 4026424618u, 578054337u, 2107870805u, 3942221995u, + 2802685757u, 2686450821u, 584898501u, 2206824201u, 487979059u}, + LCW{1154048732u, 3209682333u, 3246563372u, 3789750511u, 1287153502u, + 3759561568u, 1092423314u, 339538635u, 4265577390u, 879551618u, + 4222824617u, 1774528854u, 1028254379u, 485918316u, 879142987u, 3619248543u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } TEST_F(TextGenerateNgramsTest, NgramsHashErrors) diff --git a/java/src/main/java/ai/rapids/cudf/Aggregation.java b/java/src/main/java/ai/rapids/cudf/Aggregation.java index 2276b223740..c07a58ed8a5 100644 --- a/java/src/main/java/ai/rapids/cudf/Aggregation.java +++ b/java/src/main/java/ai/rapids/cudf/Aggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java index 27966ddfdd4..234a9ec1ced 100644 --- a/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/GroupByAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java index 0b6ecf2e140..124f2c99188 100644 --- a/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java +++ b/java/src/main/java/ai/rapids/cudf/HostUDFWrapper.java @@ -24,8 +24,10 @@ *

* A new host UDF aggregation implementation must extend this class and override the * {@code hashCode} and {@code equals} methods for such purposes. + * In addition, since this class implements {@code AutoCloseable}, the {@code close} method must + * also be overridden to automatically delete the native UDF instance upon class destruction. */ -public abstract class HostUDFWrapper { +public abstract class HostUDFWrapper implements AutoCloseable { public final long udfNativeHandle; public HostUDFWrapper(long udfNativeHandle) { diff --git a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java index ba8ae379bae..4f047a68f06 100644 --- a/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/ReductionAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -161,14 +161,14 @@ public static ReductionAggregation median() { /** * Aggregate to compute the specified quantiles. Uses linear interpolation by default. */ - public static ReductionAggregation quantile(double ... quantiles) { + public static ReductionAggregation quantile(double... quantiles) { return new ReductionAggregation(Aggregation.quantile(quantiles)); } /** * Aggregate to compute various quantiles. */ - public static ReductionAggregation quantile(QuantileMethod method, double ... quantiles) { + public static ReductionAggregation quantile(QuantileMethod method, double... quantiles) { return new ReductionAggregation(Aggregation.quantile(method, quantiles)); } @@ -256,7 +256,7 @@ public static ReductionAggregation collectSet() { * @param nanEquality Flag to specify whether NaN values in floating point column should be considered equal. */ public static ReductionAggregation collectSet(NullPolicy nullPolicy, - NullEquality nullEquality, NaNEquality nanEquality) { + NullEquality nullEquality, NaNEquality nanEquality) { return new ReductionAggregation(Aggregation.collectSet(nullPolicy, nullEquality, nanEquality)); } @@ -286,6 +286,15 @@ public static ReductionAggregation mergeSets(NullEquality nullEquality, NaNEqual return new ReductionAggregation(Aggregation.mergeSets(nullEquality, nanEquality)); } + /** + * Execute a reduction using a host-side user-defined function (UDF). + * @param wrapper The wrapper for the native host UDF instance. + * @return A new ReductionAggregation instance + */ + public static ReductionAggregation hostUDF(HostUDFWrapper wrapper) { + return new ReductionAggregation(Aggregation.hostUDF(wrapper)); + } + /** * Create HistogramAggregation, computing the frequencies for each unique row. * diff --git a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java index 7ed150a2fec..18e7d874886 100644 --- a/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java +++ b/java/src/main/java/ai/rapids/cudf/SegmentedReductionAggregation.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -101,4 +101,13 @@ public static SegmentedReductionAggregation any() { public static SegmentedReductionAggregation all() { return new SegmentedReductionAggregation(Aggregation.all()); } + + /** + * Execute a reduction using a host-side user-defined function (UDF). + * @param wrapper The wrapper for the native host UDF instance. + * @return A new SegmentedReductionAggregation instance + */ + public static SegmentedReductionAggregation hostUDF(HostUDFWrapper wrapper) { + return new SegmentedReductionAggregation(Aggregation.hostUDF(wrapper)); + } } diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 30da8727366..19f2802553d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2413,7 +2413,11 @@ def as_column( and pa.types.is_integer(arbitrary.type) and arbitrary.null_count > 0 ): + # TODO: Need to re-visit this cast and fill_null + # calls while addressing the following issue: + # https://github.com/rapidsai/cudf/issues/14149 arbitrary = arbitrary.cast(pa.float64()) + arbitrary = pc.fill_null(arbitrary, np.nan) if ( cudf.get_option("default_integer_bitwidth") and pa.types.is_integer(arbitrary.type) diff --git a/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet new file mode 100644 index 00000000000..4123545a6e0 Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet differ diff --git a/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_chunk_stats.snappy.parquet b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_chunk_stats.snappy.parquet new file mode 100644 index 00000000000..7dc2cee21ae Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_100_chunk_stats.snappy.parquet differ diff --git a/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet new file mode 100644 index 00000000000..e898f1d7d1b Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet differ diff --git a/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_chunk_stats.snappy.parquet b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_chunk_stats.snappy.parquet new file mode 100644 index 00000000000..3060234d499 Binary files /dev/null and b/python/cudf/cudf/tests/data/parquet/mixed_card_ndv_500_chunk_stats.snappy.parquet differ diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 77d1f77d30b..9d5f32c7ab9 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -1,6 +1,7 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
import datetime +import decimal import glob import hashlib import math @@ -4370,3 +4371,57 @@ def test_parquet_reader_mismatched_nullability_structs(tmpdir): cudf.read_parquet([buf2, buf1]), cudf.concat([df2, df1]).reset_index(drop=True), ) + + +@pytest.mark.parametrize( + "stats_fname,bloom_filter_fname", + [ + ( + "mixed_card_ndv_100_chunk_stats.snappy.parquet", + "mixed_card_ndv_100_bf_fpp0.1_nostats.snappy.parquet", + ), + ( + "mixed_card_ndv_500_chunk_stats.snappy.parquet", + "mixed_card_ndv_500_bf_fpp0.1_nostats.snappy.parquet", + ), + ], +) +@pytest.mark.parametrize( + "predicate,expected_len", + [ + ([[("str", "==", "FINDME")], [("fp64", "==", float(500))]], 2), + ([("fixed_pt", "==", decimal.Decimal(float(500)))], 2), + ([[("ui32", "==", np.uint32(500)), ("str", "==", "FINDME")]], 2), + ([[("str", "==", "FINDME")], [("ui32", ">=", np.uint32(0))]], 1000), + ( + [ + ("str", "!=", "FINDME"), + ("fixed_pt", "==", decimal.Decimal(float(500))), + ], + 0, + ), + ], +) +def test_parquet_bloom_filters( + datadir, stats_fname, bloom_filter_fname, predicate, expected_len +): + fname_stats = datadir / stats_fname + fname_bf = datadir / bloom_filter_fname + df_stats = cudf.read_parquet(fname_stats, filters=predicate).reset_index( + drop=True + ) + df_bf = cudf.read_parquet(fname_bf, filters=predicate).reset_index( + drop=True + ) + + # Check if tables equal + assert_eq( + df_stats, + df_bf, + ) + + # Check for table length + assert_eq( + len(df_stats), + expected_len, + ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index f8697c5c6b8..891c0ede9a4 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import datetime import decimal import hashlib @@ -3003,3 +3003,12 @@ def test_dtype_dtypes_equal(): ser = cudf.Series([0]) assert ser.dtype is ser.dtypes assert ser.dtypes is ser.to_pandas().dtypes + + +def test_null_like_to_nan_pandas_compat(): + with cudf.option_context("mode.pandas_compatible", True): + ser = cudf.Series([1, 2, np.nan, 10, None]) + pser = pd.Series([1, 2, np.nan, 10, None]) + + assert pser.dtype == ser.dtype + assert_eq(ser, pser) diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index c16df320ceb..e453a8b89b9 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -8,7 +8,9 @@ from functools import partialmethod from typing import TYPE_CHECKING +import fastexcel import pytest +from packaging import version import polars @@ -44,7 +46,7 @@ def pytest_configure(config: pytest.Config) -> None: ) -EXPECTED_FAILURES: Mapping[str, str] = { +EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = { "tests/unit/io/test_csv.py::test_compressed_csv": "Need to determine if file is compressed", "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning", @@ -192,6 +194,10 @@ def pytest_configure(config: pytest.Config) -> None: # Maybe flaky, order-dependent? 
"tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", + "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": ( + "Fails when fastexcel version >= 0.12.1. tracking issue: https://github.com/pola-rs/polars/issues/20698", + version.parse(fastexcel.__version__) >= version.parse("0.12.1"), + ), } @@ -219,4 +225,12 @@ def pytest_collection_modifyitems( if item.nodeid in TESTS_TO_SKIP: item.add_marker(pytest.mark.skip(reason=TESTS_TO_SKIP[item.nodeid])) elif item.nodeid in EXPECTED_FAILURES: + if isinstance(EXPECTED_FAILURES[item.nodeid], tuple): + # the second entry in the tuple is the condition to xfail on + item.add_marker( + pytest.mark.xfail( + condition=EXPECTED_FAILURES[item.nodeid][1], + reason=EXPECTED_FAILURES[item.nodeid][0], + ), + ) item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid]))