diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 65aebfb7f8c..f6b3fb83cdd 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -90,7 +90,7 @@ jobs: package-name: libcudf package-type: cpp wheel-build-pylibcudf: - needs: [wheel-publish-libcudf] + needs: [wheel-build-libcudf] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: @@ -111,7 +111,7 @@ jobs: package-name: pylibcudf package-type: python wheel-build-cudf: - needs: wheel-publish-pylibcudf + needs: wheel-build-pylibcudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: @@ -132,7 +132,7 @@ jobs: package-name: cudf package-type: python wheel-build-dask-cudf: - needs: wheel-publish-cudf + needs: wheel-build-cudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: @@ -155,7 +155,7 @@ jobs: package-name: dask_cudf package-type: python wheel-build-cudf-polars: - needs: wheel-publish-pylibcudf + needs: wheel-build-pylibcudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d99b74506e4..965b667605c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ ci: autoupdate_branch: "" autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate" autoupdate_schedule: quarterly - skip: ["verify-alpha-spec"] + skip: ["verify-alpha-spec", "nbqa-isort"] submodules: false repos: @@ -173,7 +173,7 @@ repos: ) - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.16.0 + rev: v1.17.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index dbb44890965..cc01f5286ef 100644 --- 
a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -67,7 +67,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.18 +- polars>=1.20,<1.22 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<20.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 1b674596a4b..d52cb85abe6 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -65,7 +65,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.18 +- polars>=1.20,<1.22 - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index 7a0005497df..fb7ab9332d8 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.18 + - polars >=1.20,<1.22 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 32bd945d57c..83e6c35216a 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -121,6 +121,10 @@ void BM_parquet_read_long_strings(nvbench::state& state) cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile); // THIS auto const view = tbl->view(); + // set smaller threshold to reduce file size and execution time + auto const threshold = 1; + setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1); + cudf::io::parquet_writer_options write_opts = cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression); @@ -129,6 +133,7 @@ void BM_parquet_read_long_strings(nvbench::state& state) 
}(); parquet_read_common(num_rows_written, num_cols, source_sink, state); + unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); } template diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 88ec0c07dc5..358170f76db 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -255,12 +255,14 @@ static sizes_to_offsets_iterator make_sizes_to_offsets_i * @param begin Input iterator for scan * @param end End of the input iterator * @param result Output iterator for scan result + * @param initial_offset Initial offset to add to scan * @return The last element of the scan */ template auto sizes_to_offsets(SizesIterator begin, SizesIterator end, OffsetsIterator result, + int64_t initial_offset, rmm::cuda_stream_view stream) { using SizeType = typename thrust::iterator_traits::value_type; @@ -273,7 +275,8 @@ auto sizes_to_offsets(SizesIterator begin, make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data()); // This function uses the type of the initialization parameter as the accumulator type // when computing the individual scan output elements. 
- thrust::exclusive_scan(rmm::exec_policy(stream), begin, end, output_itr, LastType{0}); + thrust::exclusive_scan( + rmm::exec_policy_nosync(stream), begin, end, output_itr, static_cast(initial_offset)); return last_element.value(stream); } @@ -319,7 +322,8 @@ std::pair, size_type> make_offsets_child_column( }); auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); // Use the sizes-to-offsets iterator to compute the total number of elements - auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream); + auto const total_elements = + sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, 0, stream); CUDF_EXPECTS( total_elements <= static_cast(std::numeric_limits::max()), "Size of output exceeds the column size limit", diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 44a86f1c84f..135f645817e 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -73,7 +73,7 @@ CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus) * `modulus` is positive and does not check for overflow. 
*/ template -constexpr S round_down_safe(S number_to_round, S modulus) noexcept +CUDF_HOST_DEVICE constexpr S round_down_safe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; @@ -113,16 +113,16 @@ CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexc * the result will be incorrect */ template -constexpr S div_rounding_up_unsafe(S const& dividend, T const& divisor) noexcept +CUDF_HOST_DEVICE constexpr S div_rounding_up_unsafe(S const& dividend, T const& divisor) noexcept { return (dividend + divisor - 1) / divisor; } namespace detail { template -constexpr I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(cuda::std::integral_constant, + I dividend, + I divisor) noexcept { // TODO: This could probably be implemented faster return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) @@ -130,7 +130,9 @@ constexpr I div_rounding_up_safe(std::integral_constant, } template -constexpr I div_rounding_up_safe(std::integral_constant, I dividend, I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(cuda::std::integral_constant, + I dividend, + I divisor) noexcept { auto quotient = dividend / divisor; auto remainder = dividend % divisor; @@ -156,9 +158,9 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend, * the non-integral division `dividend/divisor` */ template -constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept { - using i_is_a_signed_type = std::integral_constant>; + using i_is_a_signed_type = cuda::std::integral_constant>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp index 
2bafe235058..643d1d07cb7 100644 --- a/cpp/include/cudf/fixed_point/temporary.hpp +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ auto to_string(T value) -> std::string } template -constexpr auto abs(T value) +CUDF_HOST_DEVICE constexpr auto abs(T value) { return value >= 0 ? value : -value; } @@ -72,7 +72,7 @@ CUDF_HOST_DEVICE inline auto max(T lhs, T rhs) } template -constexpr auto exp10(int32_t exponent) +CUDF_HOST_DEVICE constexpr auto exp10(int32_t exponent) { BaseType value = 1; while (exponent > 0) diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index 32187b43d34..24b8738d5dd 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,8 @@ #include +#include + #include namespace CUDF_EXPORT cudf { @@ -45,7 +47,7 @@ struct multistate { * * @note: The behavior of this function is undefined if size() => max_segment_count */ - constexpr void enqueue(uint8_t head, uint8_t tail) + CUDF_HOST_DEVICE constexpr void enqueue(uint8_t head, uint8_t tail) { _heads |= (head & 0xFu) << (_size * 4); _tails |= (tail & 0xFu) << (_size * 4); @@ -55,17 +57,17 @@ struct multistate { /** * @brief get's the number of segments this multistate represents */ - [[nodiscard]] constexpr uint8_t size() const { return _size; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t size() const { return _size; } /** * @brief get's the highest (____, tail] value this multistate represents */ - [[nodiscard]] constexpr uint8_t max_tail() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t max_tail() const { uint8_t maximum = 0; for (uint8_t i = 0; i < _size; i++) { - maximum = std::max(maximum, get_tail(i)); + maximum = cuda::std::max(maximum, get_tail(i)); } return maximum; @@ -74,7 +76,7 @@ struct multistate { /** * @brief get's the Nth (head, ____] value state this multistate represents */ - [[nodiscard]] constexpr uint8_t get_head(uint8_t idx) const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t get_head(uint8_t idx) const { return (_heads >> (idx * 4)) & 0xFu; } @@ -82,7 +84,7 @@ struct multistate { /** * @brief get's the Nth (____, tail] value state this multistate represents */ - [[nodiscard]] constexpr uint8_t get_tail(uint8_t idx) const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t get_tail(uint8_t idx) const { return (_tails >> (idx * 4)) & 0xFu; } diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index 8440805960e..5ae4af411b6 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include +#include #include #include #include @@ -46,7 +47,7 @@ __device__ inline thrust::pair parse_integer( // highest value where another decimal digit cannot be appended without an overflow; // this preserves the most digits when scaling the final result for this type constexpr UnsignedDecimalType decimal_max = - (std::numeric_limits::max() - 9L) / 10L; + (cuda::std::numeric_limits::max() - 9L) / 10L; __uint128_t value = 0; // for checking overflow int32_t exp_offset = 0; @@ -90,7 +91,8 @@ __device__ inline thrust::pair parse_integer( template __device__ cuda::std::optional parse_exponent(char const* iter, char const* iter_end) { - constexpr uint32_t exponent_max = static_cast(std::numeric_limits::max()); + constexpr uint32_t exponent_max = + static_cast(cuda::std::numeric_limits::max()); // get optional exponent sign int32_t const exp_sign = [&iter] { diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh index 0ee26ec9ee2..af4a4ce7cd2 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include +#include + namespace cudf::strings::detail { /** @@ -33,7 +35,7 @@ __device__ inline int32_t fixed_point_string_size(__int128_t const& value, int32 auto const abs_value = numeric::detail::abs(value); auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); auto const fraction = count_digits(abs_value % exp_ten); - auto const num_zeros = std::max(0, (-scale - fraction)); + auto const num_zeros = cuda::std::max(0, (-scale - fraction)); return static_cast(value < 0) + // sign if negative count_digits(abs_value / exp_ten) + // integer 1 + // decimal point @@ -66,7 +68,7 @@ __device__ inline void fixed_point_to_string(__int128_t const& value, int32_t sc if (value < 0) *out_ptr++ = '-'; // add sign auto const abs_value = numeric::detail::abs(value); auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); - auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); + auto const num_zeros = cuda::std::max(0, (-scale - count_digits(abs_value % exp_ten))); out_ptr += integer_to_string(abs_value / exp_ten, out_ptr); // add the integer part *out_ptr++ = '.'; // add decimal point diff --git a/cpp/include/cudf/strings/detail/convert/int_to_string.cuh b/cpp/include/cudf/strings/detail/convert/int_to_string.cuh index f6e6a10a864..39b9cd6978c 100644 --- a/cpp/include/cudf/strings/detail/convert/int_to_string.cuh +++ b/cpp/include/cudf/strings/detail/convert/int_to_string.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -67,7 +67,7 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) * @return size_type number of digits in input value */ template -constexpr size_type count_digits(IntegerType value) +__device__ constexpr size_type count_digits(IntegerType value) { if (value == 0) return 1; bool const is_negative = cuda::std::is_signed() ? (value < 0) : false; diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index de2f1770e28..cd386ea886f 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,21 @@ namespace cudf { namespace strings { namespace detail { +template +struct string_offsets_fn { + Iter _begin; + size_type _strings_count; + constexpr string_offsets_fn(Iter begin, size_type strings_count) + : _begin{begin}, _strings_count{strings_count} + { + } + + __device__ constexpr size_type operator()(size_type idx) const noexcept + { + return idx < _strings_count ? static_cast(_begin[idx]) : size_type{0}; + }; +}; + /** * @brief Gather characters to create a strings column using the given string-index pair iterator * @@ -133,14 +148,11 @@ std::pair, int64_t> make_offsets_child_column( // using exclusive-scan technically requires strings_count+1 input values even though // the final input value is never used. // The input iterator is wrapped here to allow the 'last value' to be safely read. - auto map_fn = cuda::proclaim_return_type( - [begin, strings_count] __device__(size_type idx) -> size_type { - return idx < strings_count ? 
static_cast(begin[idx]) : size_type{0}; - }); - auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); + auto input_itr = + cudf::detail::make_counting_transform_iterator(0, string_offsets_fn{begin, strings_count}); // Use the sizes-to-offsets iterator to compute the total number of elements auto const total_bytes = - cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); + cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, 0, stream); auto const threshold = cudf::strings::get_offset64_threshold(); CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total_bytes < threshold), @@ -151,7 +163,8 @@ std::pair, int64_t> make_offsets_child_column( offsets_column = make_numeric_column( data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); auto d_offsets64 = offsets_column->mutable_view().template data(); - cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); + cudf::detail::sizes_to_offsets( + input_itr, input_itr + strings_count + 1, d_offsets64, 0, stream); } return std::pair(std::move(offsets_column), total_bytes); diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh index bfdba238a1e..0463eca65e9 100644 --- a/cpp/src/io/csv/datetime.cuh +++ b/cpp/src/io/csv/datetime.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,7 +197,7 @@ __inline__ __device__ cuda::std::chrono::hh_mm_ss extract_time_of_d /** * @brief Checks whether `c` is decimal digit */ -constexpr bool is_digit(char c) { return c >= '0' and c <= '9'; } +__device__ constexpr bool is_digit(char c) { return c >= '0' and c <= '9'; } /** * @brief Parses a datetime string and computes the corresponding timestamp. 
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1a0c59e365a..1587c4da9c8 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -376,6 +376,48 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, {}); } +struct scatter_fn { + column_device_view _col; + size_type* _d_strview_offsets; + string_view* _d_strviews; + size_type const* _labels; + size_type const* _list_offsets; + column_device_view _d_strings_children; + string_view _element_separator; + string_view _element_narep; + + scatter_fn(column_device_view col, + size_type* d_strview_offsets, + string_view* d_strviews, + size_type const* labels, + size_type const* list_offsets, + column_device_view d_strings_children, + string_view const element_separator, + string_view const element_narep) noexcept + : _col{col}, + _d_strview_offsets{d_strview_offsets}, + _d_strviews{d_strviews}, + _labels{labels}, + _list_offsets{list_offsets}, + _d_strings_children{d_strings_children}, + _element_separator{element_separator}, + _element_narep{element_narep} + { + } + + __device__ void operator()(size_type idx) const + { + auto const label = _labels[idx]; + auto const sublist_index = idx - _list_offsets[label]; + auto const strview_index = _d_strview_offsets[label] + sublist_index * 2 + 1; + // value or na_rep + auto const strview = _d_strings_children.element(idx); + _d_strviews[strview_index] = _d_strings_children.is_null(idx) ? _element_narep : strview; + // separator + if (sublist_index != 0) { _d_strviews[strview_index - 1] = _element_separator; } + } +}; + /** * @brief Concatenates a list of strings columns into a single strings column.
* @@ -461,24 +503,14 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_strings), - [col = *col_device_view, - d_strview_offsets = d_strview_offsets.begin(), - d_strviews = d_strviews.begin(), - labels = labels->view().begin(), - list_offsets = offsets.begin(), - d_strings_children = *d_strings_children, - element_separator, - element_narep] __device__(auto idx) { - auto const label = labels[idx]; - auto const sublist_index = idx - list_offsets[label]; - auto const strview_index = d_strview_offsets[label] + sublist_index * 2 + 1; - // value or na_rep - auto const strview = d_strings_children.element(idx); - d_strviews[strview_index] = - d_strings_children.is_null(idx) ? element_narep : strview; - // separator - if (sublist_index != 0) { d_strviews[strview_index - 1] = element_separator; } - }); + scatter_fn{*col_device_view, + d_strview_offsets.data(), + d_strviews.data(), + labels->view().data(), + offsets.data(), + *d_strings_children, + element_separator, + element_narep}); auto joined_col = make_strings_column(d_strviews, string_view{nullptr, 0}, stream, mr); diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 5ab36fdae8e..8dccf65ef10 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -707,7 +707,7 @@ struct orc_column_device_view : public column_device_view { struct rowgroup_rows { size_type begin; size_type end; - [[nodiscard]] constexpr auto size() const noexcept { return end - begin; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return end - begin; } }; } // namespace orc diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index e01b93262d7..5f4c1e0696d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -22,6 +22,8 @@ #include +#include + namespace cudf::io::orc::gpu { using strings::detail::fixed_point_string_size; @@ -212,7 +214,7 @@ __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* r } // Splits a nanosecond timestamp into milliseconds and nanoseconds -__device__ std::pair split_nanosecond_timestamp(int64_t nano_count) +__device__ cuda::std::pair split_nanosecond_timestamp(int64_t nano_count) { auto const ns = cuda::std::chrono::nanoseconds(nano_count); auto const ms_floor = cuda::std::chrono::floor(ns); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index bcdd059bf67..857daeb5856 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -413,8 +414,8 @@ static __device__ uint32_t IntegerRLE( // Find minimum and maximum values if (literal_run > 0) { // Find min & max - T vmin = (t < literal_run) ? v0 : std::numeric_limits::max(); - T vmax = (t < literal_run) ? v0 : std::numeric_limits::min(); + T vmin = (t < literal_run) ? v0 : cuda::std::numeric_limits::max(); + T vmax = (t < literal_run) ? v0 : cuda::std::numeric_limits::min(); uint32_t literal_mode, literal_w; vmin = block_reduce(temp_storage).Reduce(vmin, cub::Min()); __syncthreads(); @@ -448,7 +449,7 @@ static __device__ uint32_t IntegerRLE( } else { uint32_t range, w; // Mode 2 base value cannot be bigger than max int64_t, i.e. 
the first bit has to be 0 - if (vmin <= std::numeric_limits::max() and mode1_w > mode2_w and + if (vmin <= cuda::std::numeric_limits::max() and mode1_w > mode2_w and (literal_run - 1) * (mode1_w - mode2_w) > 4) { s->u.intrle.literal_mode = 2; w = mode2_w; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 0c739f59b0a..5e23bc5adcc 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -243,9 +244,9 @@ enum row_entry_state_e { */ static auto __device__ index_order_from_index_types(uint32_t index_types_bitmap) { - constexpr std::array full_order = {CI_PRESENT, CI_DATA, CI_DATA2}; + constexpr cuda::std::array full_order = {CI_PRESENT, CI_DATA, CI_DATA2}; - std::array partial_order; + cuda::std::array partial_order; thrust::copy_if(thrust::seq, full_order.cbegin(), full_order.cend(), diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a0cd126cff0..5c3377a1aeb 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -71,7 +71,7 @@ namespace cudf::io::orc::detail { template -[[nodiscard]] constexpr int varint_size(T val) +[[nodiscard]] CUDF_HOST_DEVICE constexpr int varint_size(T val) { auto len = 1u; while (val > 0x7f) { diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b8e72aaac88..023402cbcf6 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -359,10 +359,10 @@ class parquet_field_struct : public parquet_field { template class parquet_field_union_struct : public parquet_field { E& enum_val; - std::optional& val; // union structs are always wrapped in std::optional + cuda::std::optional& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, std::optional& v) + parquet_field_union_struct(int f, E& ev, 
cuda::std::optional& v) : parquet_field(f), enum_val(ev), val(v) { } diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index f63a4fb79b9..84f751dea6b 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -942,6 +942,7 @@ constexpr bool is_split_decode() * @param chunks List of column chunks * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read + * @param initial_str_offsets Vector to store the initial offsets for large nested string cols * @param error_code Error code to set if an error is encountered */ template @@ -950,6 +951,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) device_span chunks, size_t min_row, size_t num_rows, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code) { constexpr bool has_dict_t = has_dict(); @@ -1161,11 +1163,14 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) valid_count = next_valid_count; } - // Now turn the array of lengths into offsets, but skip if this is a large string column. In the - // latter case, offsets will be computed during string column creation. if constexpr (has_strings_t) { - if (!s->col.is_large_string_col) { - convert_small_string_lengths_to_offsets(s); + // For large strings, update the initial string buffer offset to be used during large string + // column construction. Otherwise, convert string sizes to final offsets. 
+ if (s->col.is_large_string_col) { + compute_initial_large_strings_offset( + s, initial_str_offsets[pages[page_idx].chunk_idx], has_lists_t); + } else { + convert_small_string_lengths_to_offsets(s, has_lists_t); } } if (t == 0 and s->error != 0) { set_error(s->error, error_code); } @@ -1185,6 +1190,7 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, decode_kernel_mask kernel_mask, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -1199,11 +1205,11 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span pages, if (level_type_size == 1) { gpuDecodePageDataGeneric <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } else { gpuDecodePageDataGeneric <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } }; diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 5b9831668e6..2f402e3c4b8 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,7 +57,7 @@ __device__ size_type gpuDeltaLengthPageStringSize(page_state_s* s, int t) delta_binary_decoder string_lengths; auto const* string_start = string_lengths.find_end_of_block(s->data_start, s->data_end); // distance is size of string data - return static_cast(std::distance(string_start, s->data_end)); + return static_cast(thrust::distance(string_start, s->data_end)); } return 0; } diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 1fa05b3a6c2..339a6233c4d 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -105,7 +105,7 @@ struct delta_binary_decoder { // returns the value stored in the `value` array at index // `rolling_index(idx)`. If `idx` is `0`, then return `first_value`. - constexpr zigzag128_t value_at(size_type idx) + __device__ constexpr zigzag128_t value_at(size_type idx) { return idx == 0 ? first_value : value[rolling_index(idx)]; } @@ -113,7 +113,7 @@ struct delta_binary_decoder { // returns the number of values encoded in the block data. when all_values is true, // account for the first value in the header. otherwise just count the values encoded // in the mini-block data. - constexpr uint32_t num_encoded_values(bool all_values) + __device__ constexpr uint32_t num_encoded_values(bool all_values) { return value_count == 0 ? 0 : all_values ? value_count : value_count - 1; } diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh index 49f4ccedbf0..56b7c8065ee 100644 --- a/cpp/src/io/parquet/delta_enc.cuh +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,8 @@ #include #include +#include +#include namespace cudf::io::parquet::detail { @@ -57,7 +59,7 @@ constexpr int buffer_size = 2 * block_size; static_assert(block_size % 128 == 0); static_assert(values_per_mini_block % 32 == 0); -constexpr int rolling_idx(int index) { return rolling_index(index); } +__device__ constexpr int rolling_idx(int index) { return rolling_index(index); } // Version of bit packer that can handle up to 64 bits values. // T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long @@ -67,8 +69,8 @@ template inline __device__ void bitpack_mini_block( uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space) { - using wide_type = - std::conditional_t, __uint128_t, uint64_t>; + using wide_type = cuda::std:: + conditional_t, __uint128_t, uint64_t>; using cudf::detail::warp_size; scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1; auto constexpr div = sizeof(scratch_type) * 8; @@ -235,7 +237,7 @@ class delta_binary_packer { size_type const idx = _current_idx + t; T const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)], _buffer[delta::rolling_idx(idx - 1)]) - : std::numeric_limits::max(); + : cuda::std::numeric_limits::max(); // Find min delta for the block. 
auto const min_delta = block_reduce(*_block_tmp).Reduce(delta, cub::Min()); diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a5023e23cc5..b101733d35e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -28,7 +28,7 @@ namespace cudf::io::parquet::detail { struct page_state_s { - constexpr page_state_s() noexcept {} + CUDF_HOST_DEVICE constexpr page_state_s() noexcept {} uint8_t const* data_start{}; uint8_t const* data_end{}; uint8_t const* lvl_end{}; @@ -121,7 +121,8 @@ struct null_count_back_copier { /** * @brief Test if the given page is in a string column */ -constexpr bool is_string_col(PageInfo const& page, device_span chunks) +__device__ constexpr bool is_string_col(PageInfo const& page, + device_span chunks) { if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return false; } auto const& col = chunks[page.chunk_idx]; diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 0c9d4e77f0c..4c98a08006c 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -435,6 +435,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) device_span chunks, size_t min_row, size_t num_rows, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code) { using cudf::detail::warp_size; @@ -579,17 +580,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) __syncthreads(); } - // Now turn the array of lengths into offsets, but skip if this is a large string column. In the - // latter case, offsets will be computed during string column creation. 
- if (not s->col.is_large_string_col) { - int value_count = nesting_info_base[leaf_level_index].value_count; - - // if no repetition we haven't calculated start/end bounds and instead just skipped - // values until we reach first_row. account for that here. - if (!has_repetition) { value_count -= s->first_row; } - - auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); - block_excl_sum(offptr, value_count, s->page.str_offset); + // For large strings, update the initial string buffer offset to be used during large string + // column construction. Otherwise, convert string sizes to final offsets. + if (s->col.is_large_string_col) { + compute_initial_large_strings_offset( + s, initial_str_offsets[pages[page_idx].chunk_idx], has_repetition); + } else { + convert_small_string_lengths_to_offsets(s, has_repetition); } if (t == 0 and s->error != 0) { set_error(s->error, error_code); } @@ -603,6 +600,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) device_span chunks, size_t min_row, size_t num_rows, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code) { using cudf::detail::warp_size; @@ -741,17 +739,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) __syncthreads(); } - // Now turn the array of lengths into offsets, but skip if this is a large string column. In the - // latter case, offsets will be computed during string column creation. - if (not s->col.is_large_string_col) { - int value_count = nesting_info_base[leaf_level_index].value_count; - - // if no repetition we haven't calculated start/end bounds and instead just skipped - // values until we reach first_row. account for that here. 
- if (!has_repetition) { value_count -= s->first_row; } - - auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); - block_excl_sum(offptr, value_count, s->page.str_offset); + // For large strings, update the initial string buffer offset to be used during large string + // column construction. Otherwise, convert string sizes to final offsets. + if (s->col.is_large_string_col) { + compute_initial_large_strings_offset( + s, initial_str_offsets[pages[page_idx].chunk_idx], has_repetition); + } else { + convert_small_string_lengths_to_offsets(s, has_repetition); } // finally, copy the string data into place @@ -797,6 +791,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -807,10 +802,10 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, if (level_type_size == 1) { gpuDecodeDeltaByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } else { gpuDecodeDeltaByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } } @@ -822,6 +817,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -832,10 +828,10 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, if (level_type_size == 1) { gpuDecodeDeltaLengthByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } else { gpuDecodeDeltaLengthByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, 
error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } } diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 7dc1255af6f..56d638c68eb 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -59,7 +62,7 @@ constexpr int encode_block_size = 128; constexpr int rle_buffer_size = 2 * encode_block_size; constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; -constexpr int rolling_idx(int pos) { return rolling_index(pos); } +__device__ constexpr int rolling_idx(int pos) { return rolling_index(pos); } // max V1 header size // also valid for dict page header (V1 or V2) @@ -113,7 +116,7 @@ using rle_page_enc_state_s = page_enc_state_s; /** * @brief Returns the size of the type in the Parquet file. */ -constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_length) +__device__ constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_length) { if (physical_type == FIXED_LEN_BYTE_ARRAY) { return id == type_id::DECIMAL128 ? 
sizeof(__int128_t) : type_length; @@ -127,7 +130,7 @@ constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_le } } -constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_values) +__device__ constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_values) { if (value_bit_width == 0) return 0; @@ -145,7 +148,7 @@ constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_value } // subtract b from a, but return 0 if this would underflow -constexpr size_t underflow_safe_subtract(size_t a, size_t b) +__device__ constexpr size_t underflow_safe_subtract(size_t a, size_t b) { if (b > a) { return 0; } return a - b; @@ -228,7 +231,8 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) __syncthreads(); // page fragment size must fit in a 32-bit signed integer - if (s->frag.fragment_data_size > static_cast(std::numeric_limits::max())) { + if (s->frag.fragment_data_size > + static_cast(cuda::std::numeric_limits::max())) { // TODO need to propagate this error back to the host CUDF_UNREACHABLE("page fragment size exceeds maximum for i32"); } @@ -357,7 +361,7 @@ struct BitwiseOr { template __device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, uint64_t* buffer, void* temp_space) { - using output_type = std::conditional_t; + using output_type = cuda::std::conditional_t; __shared__ delta_binary_packer packer; auto const t = threadIdx.x; @@ -737,7 +741,7 @@ CUDF_KERNEL void __launch_bounds__(128) : frag_g.fragment_data_size; // page fragment size must fit in a 32-bit signed integer - if (fragment_data_size > std::numeric_limits::max()) { + if (fragment_data_size > cuda::std::numeric_limits::max()) { CUDF_UNREACHABLE("page fragment size exceeds maximum for i32"); } @@ -816,7 +820,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_size + rle_pad + (write_v2_headers ? 
page_g.max_lvl_size : def_level_size + rep_level_size); // page size must fit in 32-bit signed integer - if (max_data_size > std::numeric_limits::max()) { + if (max_data_size > cuda::std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); } // if byte_array then save the variable bytes size @@ -1321,7 +1325,7 @@ static __device__ void PlainBoolEncode(rle_page_enc_state_s* s, * @return The difference between two epochs in `cuda::std::chrono::duration` format with a period * of hours. */ -constexpr auto julian_calendar_epoch_diff() +__device__ constexpr auto julian_calendar_epoch_diff() { using namespace cuda::std::chrono; using namespace cuda::std::chrono_literals; @@ -1346,7 +1350,7 @@ __device__ auto julian_days_with_time(int64_t v) auto const dur_time_of_day = dur_total - dur_days; auto const dur_time_of_day_nanos = duration_cast(dur_time_of_day); auto const julian_days = dur_days + ceil(julian_calendar_epoch_diff()); - return std::make_pair(dur_time_of_day_nanos, julian_days); + return cuda::std::pair{dur_time_of_day_nanos, julian_days}; } // this has been split out into its own kernel because of the amount of shared memory required @@ -1711,7 +1715,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) : 0; val_idx = val_idx_in_leaf_col; } - return std::make_tuple(is_valid, val_idx); + return cuda::std::make_tuple(is_valid, val_idx); }(); cur_val_idx += nvals; @@ -1950,7 +1954,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) // need to test for use_dictionary because it might be boolean uint32_t const val_idx = (s->ck.use_dictionary) ? 
val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; - return std::make_tuple(is_valid, val_idx); + return cuda::std::tuple{is_valid, val_idx}; }(); cur_val_idx += nvals; @@ -2200,7 +2204,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) auto const arr_size = get_element(*s->col.leaf_column, val_idx).size_bytes(); // the lengths are assumed to be INT32, check for overflow - if (arr_size > static_cast(std::numeric_limits::max())) { + if (arr_size > static_cast(cuda::std::numeric_limits::max())) { CUDF_UNREACHABLE("byte array size exceeds 2GB"); } v = static_cast(arr_size); @@ -2641,7 +2645,7 @@ class header_encoder { cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::LIST); auto const t_num = static_cast(type); current_header_ptr = cpw_put_uint8( - current_header_ptr, static_cast((std::min(len, size_t{0xfu}) << 4) | t_num)); + current_header_ptr, static_cast((cuda::std::min(len, size_t{0xfu}) << 4) | t_num)); if (len >= 0xf) { current_header_ptr = cpw_put_uint32(current_header_ptr, len); } current_field_index = 0; } @@ -2802,10 +2806,8 @@ __device__ bool increment_utf8_at(unsigned char* ptr) * * @return Pair object containing a pointer to the truncated data and its length. */ -__device__ std::pair truncate_utf8(device_span span, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_utf8( + device_span span, bool is_min, void* scratch, int32_t truncate_length) { // we know at this point that truncate_length < size_bytes, so // there is data at [len]. 
work backwards until we find @@ -2842,10 +2844,10 @@ __device__ std::pair truncate_utf8(device_span truncate_binary(device_span arr, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_binary(device_span arr, + bool is_min, + void* scratch, + int32_t truncate_length) { if (is_min) { return {arr.data(), truncate_length}; } memcpy(scratch, arr.data(), truncate_length); @@ -2869,10 +2871,10 @@ __device__ std::pair truncate_binary(device_span truncate_string(string_view const& str, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_string(string_view const& str, + bool is_min, + void* scratch, + int32_t truncate_length) { if (truncate_length == NO_TRUNC_STATS or str.size_bytes() <= truncate_length) { return {str.data(), str.size_bytes()}; @@ -2893,7 +2895,7 @@ __device__ std::pair truncate_string(string_view const& s /** * @brief Attempt to truncate a binary array to at most truncate_length bytes. */ -__device__ std::pair truncate_byte_array( +__device__ cuda::std::pair truncate_byte_array( statistics::byte_array_view const& arr, bool is_min, void* scratch, int32_t truncate_length) { if (truncate_length == NO_TRUNC_STATS or arr.size_bytes() <= truncate_length) { @@ -2914,11 +2916,11 @@ __device__ std::pair truncate_byte_array( * valid min or max binary value. String and byte array types will be truncated if they exceed * truncate_length. 
*/ -__device__ std::pair get_extremum(statistics_val const* stats_val, - statistics_dtype dtype, - void* scratch, - bool is_min, - int32_t truncate_length) +__device__ cuda::std::pair get_extremum(statistics_val const* stats_val, + statistics_dtype dtype, + void* scratch, + bool is_min, + int32_t truncate_length) { switch (dtype) { case dtype_bool: return {stats_val, sizeof(bool)}; diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index dc4140d0a44..ba627e73625 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -20,6 +20,8 @@ #include +#include + namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. @@ -98,21 +100,54 @@ __device__ inline void block_excl_sum(size_type* arr, size_type length, size_typ } } -template -__device__ inline void convert_small_string_lengths_to_offsets(page_state_s* s) +/** + * @brief Converts string sizes to offsets if this is not a large string column. Otherwise, + * atomically update the initial string offset to be used during large string column construction + */ +template +__device__ void convert_small_string_lengths_to_offsets(page_state_s const* const state, + bool has_lists) { // If this is a large string column. In the // latter case, offsets will be computed during string column creation. - auto& ni = s->nesting_info[s->col.max_nesting_depth - 1]; + auto& ni = state->nesting_info[state->col.max_nesting_depth - 1]; int value_count = ni.value_count; // if no repetition we haven't calculated start/end bounds and instead just skipped // values until we reach first_row. account for that here. 
- if constexpr (!has_lists) { value_count -= s->first_row; } + if (not has_lists) { value_count -= state->first_row; } + + // Convert the array of lengths into offsets + if (value_count > 0) { + auto const offptr = reinterpret_cast(ni.data_out); + auto const initial_value = state->page.str_offset; + block_excl_sum(offptr, value_count, initial_value); + } +} - auto const offptr = reinterpret_cast(ni.data_out); - auto const initial_value = s->page.str_offset; - block_excl_sum(offptr, value_count, initial_value); +/** + * @brief Atomically update the initial string offset to be used during large string column + * construction + */ +inline __device__ void compute_initial_large_strings_offset(page_state_s const* const state, + size_t& initial_str_offset, + bool has_lists) +{ + // Values decoded by this page. + int value_count = state->nesting_info[state->col.max_nesting_depth - 1].value_count; + + // if no repetition we haven't calculated start/end bounds and instead just skipped + // values until we reach first_row. account for that here. + if (not has_lists) { value_count -= state->first_row; } + + // Atomically update the initial string offset if this is a large string column. This initial + // offset will be used to compute (64-bit) offsets during large string column construction. 
+ if (value_count > 0 and threadIdx.x == 0) { + auto const initial_value = state->page.str_offset; + cuda::atomic_ref initial_str_offsets_ref{ + initial_str_offset}; + initial_str_offsets_ref.fetch_min(initial_value, cuda::std::memory_order_relaxed); + } } template diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index dc0c4b1540e..f7cbe2bd924 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,6 +20,8 @@ #include +#include + #include #include #include @@ -92,10 +94,10 @@ struct LogicalType { BSON }; Type type; - std::optional decimal_type; - std::optional time_type; - std::optional timestamp_type; - std::optional int_type; + cuda::std::optional decimal_type; + cuda::std::optional time_type; + cuda::std::optional timestamp_type; + cuda::std::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -103,36 +105,36 @@ struct LogicalType { LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} - [[nodiscard]] constexpr bool is_time_millis() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_millis() const { return type == TIME and time_type->unit.type == TimeUnit::MILLIS; } - [[nodiscard]] constexpr bool is_time_micros() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_micros() const { return type == TIME and time_type->unit.type == TimeUnit::MICROS; } - [[nodiscard]] constexpr bool is_time_nanos() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_nanos() const { return type == TIME and time_type->unit.type == TimeUnit::NANOS; } - [[nodiscard]] constexpr bool is_timestamp_millis() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_millis() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS; } - [[nodiscard]] constexpr bool is_timestamp_micros() const + [[nodiscard]] 
CUDF_HOST_DEVICE constexpr bool is_timestamp_micros() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS; } - [[nodiscard]] constexpr bool is_timestamp_nanos() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_nanos() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS; } - [[nodiscard]] constexpr int8_t bit_width() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr int8_t bit_width() const { return type == INTEGER ? int_type->bitWidth : -1; } @@ -144,7 +146,7 @@ struct LogicalType { return type == DECIMAL ? decimal_type->scale : -1; } - [[nodiscard]] constexpr int32_t precision() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t precision() const { return type == DECIMAL ? decimal_type->precision : -1; } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 3c8d32572f8..a78da513b36 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -52,7 +53,7 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; constexpr int LEVEL_DECODE_BUF_SIZE = 2048; template -constexpr int rolling_index(int index) +CUDF_HOST_DEVICE constexpr int rolling_index(int index) { // Cannot divide by 0. But `rolling_size` will be 0 for unused arrays, so this case will never // actual be executed. @@ -78,7 +79,7 @@ constexpr uint8_t REP_LVL_HIST_CUTOFF = 0; constexpr uint8_t DEF_LVL_HIST_CUTOFF = 0; // see setupLocalPageInfo() in page_decode.cuh for supported page encodings -constexpr bool is_supported_encoding(Encoding enc) +CUDF_HOST_DEVICE constexpr bool is_supported_encoding(Encoding enc) { switch (enc) { case Encoding::PLAIN: @@ -96,7 +97,8 @@ constexpr bool is_supported_encoding(Encoding enc) /** * @brief Atomically OR `error` into `error_code`. 
*/ -constexpr void set_error(kernel_error::value_type error, kernel_error::pointer error_code) +__device__ constexpr void set_error(kernel_error::value_type error, + kernel_error::pointer error_code) { if (error != 0) { cuda::atomic_ref ref{*error_code}; @@ -162,14 +164,14 @@ using std::is_scoped_enum; // helpers to do bit operations on scoped enums template || is_scoped_enum::value))> -constexpr std::uint32_t BitAnd(Ts... bits) +CUDF_HOST_DEVICE constexpr std::uint32_t BitAnd(Ts... bits) { return (... & static_cast(bits)); } template || is_scoped_enum::value))> -constexpr std::uint32_t BitOr(Ts... bits) +CUDF_HOST_DEVICE constexpr std::uint32_t BitOr(Ts... bits) { return (... | static_cast(bits)); } @@ -401,7 +403,7 @@ inline auto make_page_key_iterator(device_span pages) * @brief Struct describing a particular chunk of column data */ struct ColumnChunkDesc { - constexpr ColumnChunkDesc() noexcept {}; + CUDF_HOST_DEVICE constexpr ColumnChunkDesc() noexcept {}; explicit ColumnChunkDesc(size_t compressed_size_, uint8_t* compressed_data_, size_t num_values_, @@ -498,8 +500,8 @@ struct parquet_column_device_view : stats_column_desc { int32_t type_length; //!< length of fixed_length_byte_array data uint8_t level_bits; //!< bits to encode max definition (lower nibble) & repetition (upper nibble) //!< levels - [[nodiscard]] constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; } - [[nodiscard]] constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; } + [[nodiscard]] __device__ constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; } + [[nodiscard]] __device__ constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; } uint8_t max_def_level; //!< needed for SizeStatistics calculation uint8_t max_rep_level; @@ -540,7 +542,7 @@ constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); struct EncPage; // convert Encoding to a mask value -constexpr uint32_t encoding_to_mask(Encoding encoding) 
+CUDF_HOST_DEVICE constexpr uint32_t encoding_to_mask(Encoding encoding) { return 1 << static_cast(encoding); } @@ -601,9 +603,15 @@ struct EncColumnChunk { uint32_t* rep_histogram_data; //!< Size is (max(level) + 1) * (num_data_pages + 1). size_t var_bytes_size; //!< Sum of var_bytes_size from the pages (byte arrays only) - [[nodiscard]] constexpr uint32_t num_dict_pages() const { return use_dictionary ? 1 : 0; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint32_t num_dict_pages() const + { + return use_dictionary ? 1 : 0; + } - [[nodiscard]] constexpr uint32_t num_data_pages() const { return num_pages - num_dict_pages(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint32_t num_data_pages() const + { + return num_pages - num_dict_pages(); + } }; /** @@ -642,15 +650,21 @@ struct EncPage { Encoding encoding; //!< Encoding used for page data uint16_t num_fragments; //!< Number of fragments in page - [[nodiscard]] constexpr bool is_v2() const { return page_type == PageType::DATA_PAGE_V2; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_v2() const + { + return page_type == PageType::DATA_PAGE_V2; + } - [[nodiscard]] constexpr auto level_bytes() const { return def_lvl_bytes + rep_lvl_bytes; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto level_bytes() const + { + return def_lvl_bytes + rep_lvl_bytes; + } }; /** * @brief Test if the given column chunk is in a string column */ -constexpr bool is_string_col(ColumnChunkDesc const& chunk) +__device__ constexpr bool is_string_col(ColumnChunkDesc const& chunk) { // return true for non-hashed byte_array and fixed_len_byte_array that isn't representing // a decimal. 
@@ -862,6 +876,7 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -870,6 +885,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -884,6 +900,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -892,6 +909,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -907,6 +925,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] kernel_mask Mask indicating the type of decoding kernel to launch. 
+ * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -916,6 +935,7 @@ void DecodePageData(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, decode_kernel_mask kernel_mask, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index dff1f3f0c0e..9dd4e19de52 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -27,6 +27,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { @@ -210,10 +211,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } + // Create an empty device vector to store the initial str offset for large string columns from for + // string decoders. + auto initial_str_offsets = rmm::device_uvector{0, _stream, _mr}; + pass.chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - if (has_strings) { chunk_nested_str_data.host_to_device_async(_stream); } + if (has_strings) { + // Host vector to initialize the initial string offsets + auto host_offsets_vector = + cudf::detail::make_host_vector(_input_columns.size(), _stream); + std::fill( + host_offsets_vector.begin(), host_offsets_vector.end(), std::numeric_limits::max()); + // Initialize the initial string offsets vector from the host vector + initial_str_offsets = + cudf::detail::make_device_uvector_async(host_offsets_vector, _stream, _mr); + chunk_nested_str_data.host_to_device_async(_stream); + } // create this before we fork streams kernel_error error_code(_stream); @@ -231,6 +246,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, decoder_mask, + 
initial_str_offsets, error_code.data(), streams[s_idx++]); }; @@ -287,6 +303,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num num_rows, skip_rows, level_type_size, + initial_str_offsets, error_code.data(), streams[s_idx++]); } @@ -298,6 +315,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num num_rows, skip_rows, level_type_size, + initial_str_offsets, error_code.data(), streams[s_idx++]); } @@ -402,6 +420,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); + // Copy over initial string offsets from device + auto h_initial_str_offsets = cudf::detail::make_host_vector_async(initial_str_offsets, _stream); + if (auto const error = error_code.value_sync(_stream); error != 0) { CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } @@ -440,6 +461,12 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); final_offsets.emplace_back(static_cast(col_string_sizes[idx])); } + // Nested large strings column + else if (input_col.nesting_depth() > 0) { + CUDF_EXPECTS(h_initial_str_offsets[idx] != std::numeric_limits::max(), + "Encountered invalid initial offset for large string column"); + out_buf.set_initial_string_offset(h_initial_str_offsets[idx]); + } } } } diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 933be889b1a..03a37327e9b 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1079,7 +1079,7 @@ struct decomp_sum { { return {a.codec, a.num_pages + b.num_pages, - std::max(a.max_page_decompressed_size, b.max_page_decompressed_size), + cuda::std::max(a.max_page_decompressed_size, b.max_page_decompressed_size), a.total_decompressed_size + b.total_decompressed_size}; } }; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 25baa1e0ec8..7d3b6a39d5b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -30,6 +30,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 43666f9e42d..3874346e471 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -649,7 +649,7 @@ void decode_page_headers(pass_intermediate_data& pass, stream.synchronize(); } -constexpr bool is_string_chunk(ColumnChunkDesc const& chunk) +__device__ constexpr bool is_string_chunk(ColumnChunkDesc const& chunk) { auto const is_decimal = chunk.logical_type.has_value() and chunk.logical_type->type == LogicalType::DECIMAL; diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 3c49de0c997..2de2670b7a7 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ namespace cudf::io::parquet::detail { template -constexpr int rle_stream_required_run_buffer_size() +__device__ constexpr int rle_stream_required_run_buffer_size() { constexpr int num_rle_stream_decode_warps = (num_threads / cudf::detail::warp_size) - 1; return (num_rle_stream_decode_warps * 2); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 028f922bec3..37b1608463b 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,7 +66,7 @@ int32_t constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; int32_t constexpr TILES_PER_CHUNK = 4096; int32_t constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; -constexpr multistate transition_init(char c, cudf::device_span delim) +__device__ constexpr multistate transition_init(char c, cudf::device_span delim) { auto result = multistate(); @@ -79,7 +79,9 @@ constexpr multistate transition_init(char c, cudf::device_span delim return result; } -constexpr multistate transition(char c, multistate state, cudf::device_span delim) +__device__ constexpr multistate transition(char c, + multistate state, + cudf::device_span delim) { auto result = multistate(); @@ -182,7 +184,7 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( auto const thread_input_offset = tile_input_offset + cudf::thread_index_type{threadIdx.x} * ITEMS_PER_THREAD; auto const thread_input_size = - std::max(chunk_input_chars.size() - thread_input_offset, 0); + cuda::std::max(chunk_input_chars.size() - thread_input_offset, 0); // STEP 1: Load inputs @@ -257,7 +259,7 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( auto const thread_input_offset = 
tile_input_offset + cudf::thread_index_type{threadIdx.x} * ITEMS_PER_THREAD; auto const thread_input_size = - std::max(chunk_input_chars.size() - thread_input_offset, 0); + cuda::std::max(chunk_input_chars.size() - thread_input_offset, 0); // STEP 1: Load inputs @@ -555,7 +557,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source if (row == last_row && insert_end) { return thrust::make_pair(chars + begin, len); } else { - return thrust::make_pair(chars + begin, std::max(0, len - delim_size)); + return thrust::make_pair(chars + begin, cuda::std::max(0, len - delim_size)); }; })); return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr); diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index da19539f509..5a8e3081681 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -249,6 +249,8 @@ class inline_column_buffer : public column_buffer_base { void create_string_data(size_t num_bytes, bool is_large_strings_col, rmm::cuda_stream_view stream); + void set_initial_string_offset(size_t offset) { initial_string_offset = offset; } + void* string_data() { return _string_data.data(); } [[nodiscard]] void const* string_data() const { return _string_data.data(); } [[nodiscard]] size_t string_size() const { return _string_data.size(); } @@ -257,6 +259,7 @@ class inline_column_buffer : public column_buffer_base { private: rmm::device_buffer _string_data{}; bool _is_large_strings_col{}; + size_t initial_string_offset{0}; }; using column_buffer = gather_column_buffer; diff --git a/cpp/src/io/utilities/column_buffer_strings.cu b/cpp/src/io/utilities/column_buffer_strings.cu index 66d0a644c12..6befc078bb2 100644 --- a/cpp/src/io/utilities/column_buffer_strings.cu +++ b/cpp/src/io/utilities/column_buffer_strings.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,7 +37,8 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu data_type{type_id::INT64}, size + 1, mask_state::UNALLOCATED, stream, _mr); auto d_offsets64 = offsets_col->mutable_view().template data(); // it's safe to call with size + 1 because _data is also sized that large - cudf::detail::sizes_to_offsets(offsets_ptr, offsets_ptr + size + 1, d_offsets64, stream); + cudf::detail::sizes_to_offsets( + offsets_ptr, offsets_ptr + size + 1, d_offsets64, initial_string_offset, stream); return make_strings_column( size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask)); } else { diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 0c49b2e5d78..2750a17d328 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,7 +145,7 @@ __device__ __forceinline__ int32_t parse_unicode_hex(char const* str) * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to * \p out_it */ -constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) +__device__ constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) { auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) : strings::detail::from_char_utf8(character, out_it); diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh index 8183a66f4f0..46a3880df84 100644 --- a/cpp/src/io/utilities/output_builder.cuh +++ b/cpp/src/io/utilities/output_builder.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,7 +59,7 @@ class split_device_span { { } - [[nodiscard]] constexpr reference operator[](size_type i) const + [[nodiscard]] __device__ constexpr reference operator[](size_type i) const { return i < _head.size() ? _head[i] : _tail[i - _head.size()]; } diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 9833dab282e..a30ede957ec 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -30,7 +30,10 @@ #include +#include #include +#include +#include #include #include #include @@ -158,7 +161,7 @@ __device__ __forceinline__ thrust::pair get_escaped_char(char escape * @return uint8_t Numeric value of the character, or `0` */ template -constexpr uint8_t decode_digit(char c, bool* valid_flag) +__device__ constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; if constexpr (as_hex and std::is_integral_v) { @@ -210,9 +213,9 @@ CUDF_HOST_DEVICE constexpr bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -__host__ __device__ cuda::std::optional parse_numeric(char const* begin, - char const* end, - parse_options_view const& opts) +CUDF_HOST_DEVICE cuda::std::optional parse_numeric(char const* begin, + char const* end, + parse_options_view const& opts) { T value{}; bool all_digits_valid = true; @@ -222,8 +225,8 @@ __host__ __device__ cuda::std::optional parse_numeric(char const* begin, int32_t sign = (*begin == '-') ? 
-1 : 1; // Handle infinity - if (std::is_floating_point_v && is_infinity(begin, end)) { - return sign * std::numeric_limits::infinity(); + if (cuda::std::is_floating_point_v && is_infinity(begin, end)) { + return sign * cuda::std::numeric_limits::infinity(); } if (*begin == '-' || *begin == '+') begin++; @@ -244,7 +247,7 @@ __host__ __device__ cuda::std::optional parse_numeric(char const* begin, ++begin; } - if (std::is_floating_point_v) { + if (cuda::std::is_floating_point_v) { // Handle fractional part of the number if necessary double divisor = 1; while (begin < end) { @@ -449,7 +452,7 @@ __inline__ __device__ It skip_character(It const& it, char ch) * * @return Trimmed range */ -__inline__ __device__ std::pair trim_whitespaces_quotes( +__inline__ __device__ cuda::std::pair trim_whitespaces_quotes( char const* begin, char const* end, char quotechar = '\0') { auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; @@ -471,8 +474,8 @@ __inline__ __device__ std::pair trim_whitespaces_quote * * @return Trimmed range */ -__inline__ __device__ std::pair trim_whitespaces(char const* begin, - char const* end) +__inline__ __device__ cuda::std::pair trim_whitespaces(char const* begin, + char const* end) { auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; @@ -495,9 +498,9 @@ __inline__ __device__ std::pair trim_whitespaces(char * * @return Trimmed range */ -__inline__ __device__ std::pair trim_quotes(char const* begin, - char const* end, - char quotechar) +__inline__ __device__ cuda::std::pair trim_quotes(char const* begin, + char const* end, + char quotechar) { if ((thrust::distance(begin, end) >= 2 && *begin == quotechar && *thrust::prev(end) == quotechar)) { diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index 4b50bf626f2..a98f3021da5 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -167,7 +167,7 @@ std::unique_ptr sequences(column_view const& starts, thrust::copy_n(rmm::exec_policy(stream), sizes_input_it, sizes.size(), offsets_begin); auto const n_elements = cudf::detail::sizes_to_offsets( - offsets_begin, offsets_begin + list_offsets->size(), offsets_begin, stream); + offsets_begin, offsets_begin + list_offsets->size(), offsets_begin, 0, stream); CUDF_EXPECTS(n_elements <= std::numeric_limits::max(), "Size of output exceeds the column size limit", std::overflow_error); diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 247440212d0..58c94b60718 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -348,7 +348,7 @@ std::pair, rmm::device_uvector> hash_subs count_substrings_kernel<<>>( *d_strings, width, offsets.data()); auto const total_hashes = - cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream); + cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), 0, stream); // hash substrings rmm::device_uvector hashes(total_hashes, stream); diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp index 39cd783de00..5d2db84ae2e 100644 --- a/cpp/tests/large_strings/parquet_tests.cpp +++ b/cpp/tests/large_strings/parquet_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,8 @@ #include "large_strings_fixture.hpp" +#include +#include #include #include @@ -143,3 +145,96 @@ TEST_F(ParquetStringsTest, DISABLED_ChunkedReadLargeStrings) // Verify that we read exactly two table chunks EXPECT_EQ(tables.size(), 2); } + +TEST_F(ParquetStringsTest, ChunkedReadNestedLargeStrings) +{ + using int32s_col = cudf::test::fixed_width_column_wrapper; + using strings_col = cudf::test::strings_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto constexpr num_rows = 100'000; + + std::vector> input_columns; + auto const int_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + + auto const str_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int32_t i) { return std::to_string(i) + std::to_string(i) + std::to_string(i); }); + input_columns.emplace_back(strings_col{str_iter, str_iter + num_rows}.release()); + + auto offsets = std::vector{}; + offsets.reserve(num_rows * 2); + cudf::size_type num_structs = 0; + for (int i = 0; i < num_rows; ++i) { + offsets.push_back(num_structs); + auto const new_list_size = i % 4; + num_structs += new_list_size; + } + offsets.push_back(num_structs); + + auto const make_structs_col = [=] { + auto child1 = int32s_col(int_iter, int_iter + num_structs); + auto child2 = int32s_col(int_iter + num_structs, int_iter + num_structs * 2); + auto child3 = strings_col{str_iter, str_iter + num_structs}; + + return structs_col{{child1, child2, child3}}.release(); + }; + + input_columns.emplace_back( + cudf::make_lists_column(static_cast(offsets.size() - 1), + int32s_col(offsets.begin(), offsets.end()).release(), + make_structs_col(), + 0, + rmm::device_buffer{})); + + // Input table + auto const table = cudf::table{std::move(input_columns)}; + auto const expected = table.view(); + + auto const child3_view = expected.column(2).child(1).child(2); // list> + auto const column_size = + 
cudf::strings_column_view(child3_view).chars_size(cudf::get_default_stream()); + // set smaller threshold to reduce file size and execution time + auto const threshold = + column_size / 16; // Empirically set to get a mix of 32 and 64 bit string col chunks. + setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1); + + // Host buffer to write Parquet + auto buffer = std::vector{}; + // Writer options + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected) + .max_page_size_bytes(512 * 1024) + .max_page_size_rows(20000) + .dictionary_policy(cudf::io::dictionary_policy::ALWAYS) + .write_v2_headers(false); + + // Write to Parquet + cudf::io::write_parquet(out_opts); + + // Reader options + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size())); + + auto constexpr chunk_read_limit = size_t{1} * 1024 * 1024; + auto constexpr pass_read_limit = 0; + // Chunked parquet reader + auto reader = cudf::io::chunked_parquet_reader(chunk_read_limit, pass_read_limit, in_opts); + + // Read chunked + auto tables = std::vector>{}; + while (reader.has_next()) { + tables.emplace_back(reader.read_chunk().tbl); + } + auto table_views = std::vector{}; + std::transform(tables.begin(), tables.end(), std::back_inserter(table_views), [](auto& tbl) { + return tbl->view(); + }); + auto result = cudf::concatenate(table_views); + + // Verify tables to be equal + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + + // go back to normal threshold + unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); +} diff --git a/dependencies.yaml b/dependencies.yaml index 54da3d98d09..30d477c91be 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -777,7 +777,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.18 + - polars>=1.20,<1.22 run_cudf_polars_experimental: common: - 
output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index 0c3159c73d6..a145cd770f9 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 # TODO: remove need for this # ruff: noqa: D101 @@ -58,6 +58,7 @@ class Name(IntEnum): OrdinalDay = auto() Quarter = auto() ReplaceTimeZone = auto() + Replace = auto() Round = auto() Second = auto() Time = auto() diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 256840c1f3d..a1c98a2ce1b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 # TODO: remove need for this # ruff: noqa: D101 @@ -41,7 +41,7 @@ class Name(IntEnum): ConcatHorizontal = auto() ConcatVertical = auto() Contains = auto() - ContainsMany = auto() + ContainsAny = auto() CountMatches = auto() EndsWith = auto() EscapeRegex = auto() @@ -57,6 +57,7 @@ class Name(IntEnum): LenBytes = auto() LenChars = auto() Lowercase = auto() + Normalize = auto() PadEnd = auto() PadStart = auto() Replace = auto() diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index fd56329a48e..74f026e57cd 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -31,7 +31,6 @@ from cudf_polars.dsl.nodebase import Node from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter from cudf_polars.utils import dtypes -from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence @@ -628,12 +627,7 @@ def slice_skip(tbl: plc.Table): ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index - if POLARS_VERSION_GT_112: - # If we sliced away some data from the start, that - # shifts the row index. - # But prior to 1.13, polars had this wrong, so we match behaviour - # https://github.com/pola-rs/polars/issues/19607 - offset += skip_rows + offset += skip_rows dtype = schema[name] step = plc.interop.from_arrow( pa.scalar(1, type=plc.interop.to_arrow(dtype)) @@ -763,11 +757,7 @@ def do_evaluate( c.obj.type() == dtype for c, dtype in zip(df.columns, schema.values(), strict=True) ) - if predicate is not None: - (mask,) = broadcast(predicate.evaluate(df), target_length=df.num_rows) - return df.filter(mask) - else: - return df + return df class Select(IR): @@ -1107,7 +1097,7 @@ class Join(IR): right_on: tuple[expr.NamedExpr, ...] 
"""List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "right", "full", "semi", "anti", "cross"], + Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"], bool, tuple[int, int] | None, str, @@ -1142,50 +1132,45 @@ def __init__( # TODO: Implement maintain_order if options[5] != "none": raise NotImplementedError("maintain_order not implemented yet") - if any( - isinstance(e.value, expr.Literal) - for e in itertools.chain(self.left_on, self.right_on) - ): - raise NotImplementedError("Join with literal as join key.") @staticmethod @cache def _joiners( - how: Literal["inner", "left", "right", "full", "semi", "anti"], + how: Literal["Inner", "Left", "Right", "Full", "Semi", "Anti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: - if how == "inner": + if how == "Inner": return ( plc.join.inner_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - elif how == "left" or how == "right": + elif how == "Left" or how == "Right": return ( plc.join.left_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "full": + elif how == "Full": return ( plc.join.full_join, plc.copying.OutOfBoundsPolicy.NULLIFY, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "semi": + elif how == "Semi": return ( plc.join.left_semi_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - elif how == "anti": + elif how == "Anti": return ( plc.join.left_anti_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - assert_never(how) + assert_never(how) # pragma: no cover @staticmethod def _reorder_maps( @@ -1246,7 +1231,7 @@ def do_evaluate( left_on_exprs: Sequence[expr.NamedExpr], right_on_exprs: Sequence[expr.NamedExpr], options: tuple[ - Literal["inner", "left", "right", "full", "semi", "anti", "cross"], + Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"], bool, tuple[int, int] | None, 
str, @@ -1258,7 +1243,7 @@ def do_evaluate( ) -> DataFrame: """Evaluate and return a dataframe.""" how, join_nulls, zlice, suffix, coalesce, _ = options - if how == "cross": + if how == "Cross": # Separate implementation, since cross_join returns the # result, not the gather maps columns = plc.join.cross_join(left.table, right.table).columns() @@ -1295,25 +1280,32 @@ def do_evaluate( table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: - if how == "right": + if how == "Right": # Right join is a left join with the tables swapped left, right = right, left left_on, right_on = right_on, left_on lg, rg = join_fn(left_on.table, right_on.table, null_equality) - if how == "left" or how == "right": + if how == "Left" or how == "Right": # Order of left table is preserved lg, rg = cls._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy ) - if coalesce and how == "inner": - right = right.discard_columns(right_on.column_names_set) + if coalesce: + if how == "Full": + # In this case, keys must be column references, + # possibly with dtype casting. We should use them in + # preference to the columns from the original tables. + left = left.with_columns(left_on.columns, replace_only=True) + right = right.with_columns(right_on.columns, replace_only=True) + else: + right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( plc.copying.gather(left.table, lg, left_policy), left.column_names ) right = DataFrame.from_table( plc.copying.gather(right.table, rg, right_policy), right.column_names ) - if coalesce and how != "inner": + if coalesce and how == "Full": left = left.with_columns( ( Column( @@ -1329,7 +1321,7 @@ def do_evaluate( replace_only=True, ) right = right.discard_columns(right_on.column_names_set) - if how == "right": + if how == "Right": # Undo the swap for right join before gluing together. 
left, right = right, left right = right.rename_columns( @@ -1374,7 +1366,9 @@ def do_evaluate( """Evaluate and return a dataframe.""" columns = [c.evaluate(df) for c in exprs] if should_broadcast: - columns = broadcast(*columns, target_length=df.num_rows) + columns = broadcast( + *columns, target_length=df.num_rows if df.num_columns != 0 else None + ) else: # Polars ensures this is true, but let's make sure nothing # went wrong. In this case, the parent node is a @@ -1769,8 +1763,6 @@ def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR) self._non_child_args = (zlice,) self.children = children schema = self.children[0].schema - if not all(s.schema == schema for s in self.children[1:]): - raise NotImplementedError("Schema mismatch") @classmethod def do_evaluate(cls, zlice: tuple[int, int] | None, *dfs: DataFrame) -> DataFrame: diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 2138ac0c700..640fc8d81c5 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - if (version := self.visitor.version()) >= (4, 3): + if (version := self.visitor.version()) >= (5, 1): e = NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. @@ -302,25 +302,52 @@ def _( # Join key dtypes are dependent on the schema of the left and # right inputs, so these must be translated with the relevant # input active. 
+ def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal: + if literal.dtype.id() == plc.types.TypeId.INT32: + plc_int64 = plc.types.DataType(plc.types.TypeId.INT64) + return expr.Literal( + plc_int64, + pa.scalar(literal.value.as_py(), type=plc.interop.to_arrow(plc_int64)), + ) + return literal + + def maybe_adjust_binop(e) -> expr.Expr: + if isinstance(e.value, expr.BinOp): + left, right = e.value.children + if isinstance(left, expr.Col) and isinstance(right, expr.Literal): + e.value.children = (left, adjust_literal_dtype(right)) + elif isinstance(left, expr.Literal) and isinstance(right, expr.Col): + e.value.children = (adjust_literal_dtype(left), right) + return e + + def translate_expr_and_maybe_fix_binop_args(translator, exprs): + return [ + maybe_adjust_binop(translate_named_expr(translator, n=e)) for e in exprs + ] + with set_node(translator.visitor, node.input_left): inp_left = translator.translate_ir(n=None) - left_on = [translate_named_expr(translator, n=e) for e in node.left_on] + # TODO: There's bug in the polars type coercion phase. Use + # translate_named_expr directly once it is resolved. 
+ # Tracking issue: https://github.com/pola-rs/polars/issues/20935 + left_on = translate_expr_and_maybe_fix_binop_args(translator, node.left_on) with set_node(translator.visitor, node.input_right): inp_right = translator.translate_ir(n=None) - right_on = [translate_named_expr(translator, n=e) for e in node.right_on] + right_on = translate_expr_and_maybe_fix_binop_args(translator, node.right_on) + if (how := node.options[0]) in { - "inner", - "left", - "right", - "full", - "cross", - "semi", - "anti", + "Inner", + "Left", + "Right", + "Full", + "Cross", + "Semi", + "Anti", }: return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) else: - how, op1, op2 = how - if how != "ie_join": + how, op1, op2 = node.options[0] + if how != "IEJoin": raise NotImplementedError( f"Unsupported join type {how}" ) # pragma: no cover; asof joins not yet exposed diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index e453a8b89b9..0b52cf1c61c 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -8,9 +8,7 @@ from functools import partialmethod from typing import TYPE_CHECKING -import fastexcel import pytest -from packaging import version import polars @@ -124,6 +122,9 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_parquet-write_parquet]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_csv-write_csv]": "Debug output 
on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_ndjson-write_ndjson]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR", "tests/unit/io/test_write.py::test_write_async[-write_csv]": "Need to add include_file_path to IR", @@ -178,6 +179,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", @@ -194,10 +196,6 @@ def pytest_configure(config: pytest.Config) -> None: # Maybe flaky, order-dependent? 
"tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", - "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": ( - "Fails when fastexcel version >= 0.12.1. tracking issue: https://github.com/pola-rs/polars/issues/20698", - version.parse(fastexcel.__version__) >= version.parse("0.12.1"), - ), } @@ -211,6 +209,11 @@ def pytest_configure(config: pytest.Config) -> None: # polars that the requested timezone is unknown. # Since this is random, just skip it, rather than xfailing. "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names", + # The test may segfault with the legacy streaming engine. We should + # remove this skip when all polars tests use the new streaming engine. + "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine", + # Fails in CI, but passes locally + "tests/unit/streaming/test_streaming.py::test_streaming_streamable_functions": "RuntimeError: polars_python::sql::PySQLContext is unsendable, but is being dropped on another thread", } @@ -233,4 +236,7 @@ def pytest_collection_modifyitems( reason=EXPECTED_FAILURES[item.nodeid][0], ), ) - item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) + else: + item.add_marker( + pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid]) + ) diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index b08cede8f7f..e9d735bdf72 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Version utilities so that cudf_polars supports a range of polars versions.""" @@ -12,14 +12,11 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_LT_111 = POLARS_VERSION < parse("1.11") -POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12") -POLARS_VERSION_GT_112 = POLARS_VERSION > parse("1.12") -POLARS_VERSION_LT_113 = POLARS_VERSION < parse("1.13") +POLARS_VERSION_LT_120 = POLARS_VERSION < parse("1.20") def _ensure_polars_version(): - if POLARS_VERSION_LT_111: + if POLARS_VERSION_LT_120: raise ImportError( - "cudf_polars requires py-polars v1.11 or greater." + "cudf_polars requires py-polars v1.20 or greater." ) # pragma: no cover diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 9fb9bbf391e..15547f85d56 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.18", + "polars>=1.20,<1.22", "pylibcudf==25.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index f1f47bfb9f1..4dbf972dc9f 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -7,13 +7,11 @@ import pytest import polars as pl -from polars.testing import assert_frame_equal from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils.versions import POLARS_VERSION_LT_112, POLARS_VERSION_LT_113 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"]) @@ -96,15 +94,7 @@ def test_left_join_with_slice(left, right, join_nulls, zlice): q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True) ctx = nullcontext() if zlice is not None: - q_expect = q.collect().slice(*zlice) q = q.slice(*zlice) - if POLARS_VERSION_LT_112 and (zlice == (1, 5) or zlice == (0, 2)): - # https://github.com/pola-rs/polars/issues/19403 - # https://github.com/pola-rs/polars/issues/19405 - ctx = pytest.raises(AssertionError) - assert_frame_equal( - q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True)) - ) with ctx: assert_gpu_result_equal(q) @@ -125,28 +115,25 @@ def test_cross_join(left, right, zlice): (pl.lit(2, dtype=pl.Int64), pl.col("a")), ], ) -def test_join_literal_key_unsupported(left, right, left_on, right_on): +def test_join_literal_key(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") - - assert_ir_translation_raises(q, NotImplementedError) + assert_gpu_result_equal(q) @pytest.mark.parametrize( "conditions", [ [pl.col("a") < pl.col("a_right")], - [pl.col("a_right") <= pl.col("a") * 2], + [ + pl.col("a_right") <= pl.col("a") * 2, + pl.col("a_right") <= 2 * pl.col("a"), + ], [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], - pytest.param( - [pl.col("b") <= pl.col("a_right") * 7, 
pl.col("a") < pl.col("d") * 2], - marks=pytest.mark.xfail( - POLARS_VERSION_LT_113, - reason="https://github.com/pola-rs/polars/issues/19597", - ), - ), + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], ], ) +@pytest.mark.parametrize("zlice", [None, (0, 5)]) def test_join_where(left, right, conditions, zlice): q = left.join_where(right, *conditions) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index 865b95a7d91..de75900f8c0 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -6,7 +6,6 @@ from cudf_polars.testing.asserts import ( assert_gpu_result_equal, - assert_ir_translation_raises, ) @@ -22,19 +21,6 @@ def test_union(): assert_gpu_result_equal(query) -def test_union_schema_mismatch_raises(): - ldf = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6, 7], - "b": [1, 1, 1, 1, 1, 1, 1], - } - ).lazy() - ldf2 = ldf.select(pl.col("a").cast(pl.Float32)) - query = pl.concat([ldf, ldf2], how="diagonal") - - assert_ir_translation_raises(query, NotImplementedError) - - def test_concat_vertical(): ldf = pl.LazyFrame( {