diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 65aebfb7f8c..f6b3fb83cdd 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -90,7 +90,7 @@ jobs: package-name: libcudf package-type: cpp wheel-build-pylibcudf: - needs: [wheel-publish-libcudf] + needs: [wheel-build-libcudf] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: @@ -111,7 +111,7 @@ jobs: package-name: pylibcudf package-type: python wheel-build-cudf: - needs: wheel-publish-pylibcudf + needs: wheel-build-pylibcudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: @@ -132,7 +132,7 @@ jobs: package-name: cudf package-type: python wheel-build-dask-cudf: - needs: wheel-publish-cudf + needs: wheel-build-cudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: @@ -155,7 +155,7 @@ jobs: package-name: dask_cudf package-type: python wheel-build-cudf-polars: - needs: wheel-publish-pylibcudf + needs: wheel-build-pylibcudf secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d99b74506e4..965b667605c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ ci: autoupdate_branch: "" autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate" autoupdate_schedule: quarterly - skip: ["verify-alpha-spec"] + skip: ["verify-alpha-spec", "nbqa-isort"] submodules: false repos: @@ -173,7 +173,7 @@ repos: ) - id: verify-alpha-spec - repo: https://github.com/rapidsai/dependency-file-generator - rev: v1.16.0 + rev: v1.17.0 hooks: - id: rapids-dependency-file-generator args: ["--clean"] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index dbb44890965..cc01f5286ef 100644 --- 
a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -67,7 +67,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.18 +- polars>=1.20,<1.22 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<20.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 1b674596a4b..d52cb85abe6 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -65,7 +65,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.11,<1.18 +- polars>=1.20,<1.22 - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index 7a0005497df..fb7ab9332d8 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.11,<1.18 + - polars >=1.20,<1.22 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 32bd945d57c..83e6c35216a 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -121,6 +121,10 @@ void BM_parquet_read_long_strings(nvbench::state& state) cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile); // THIS auto const view = tbl->view(); + // set smaller threshold to reduce file size and execution time + auto const threshold = 1; + setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1); + cudf::io::parquet_writer_options write_opts = cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view) .compression(compression); @@ -129,6 +133,7 @@ void BM_parquet_read_long_strings(nvbench::state& state) 
}(); parquet_read_common(num_rows_written, num_cols, source_sink, state); + unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); } template diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh index 88ec0c07dc5..358170f76db 100644 --- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh +++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -255,12 +255,14 @@ static sizes_to_offsets_iterator make_sizes_to_offsets_i * @param begin Input iterator for scan * @param end End of the input iterator * @param result Output iterator for scan result + * @param initial_offset Initial offset to add to scan * @return The last element of the scan */ template auto sizes_to_offsets(SizesIterator begin, SizesIterator end, OffsetsIterator result, + int64_t initial_offset, rmm::cuda_stream_view stream) { using SizeType = typename thrust::iterator_traits::value_type; @@ -273,7 +275,8 @@ auto sizes_to_offsets(SizesIterator begin, make_sizes_to_offsets_iterator(result, result + std::distance(begin, end), last_element.data()); // This function uses the type of the initialization parameter as the accumulator type // when computing the individual scan output elements. 
- thrust::exclusive_scan(rmm::exec_policy(stream), begin, end, output_itr, LastType{0}); + thrust::exclusive_scan( + rmm::exec_policy_nosync(stream), begin, end, output_itr, static_cast(initial_offset)); return last_element.value(stream); } @@ -319,7 +322,8 @@ std::pair, size_type> make_offsets_child_column( }); auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); // Use the sizes-to-offsets iterator to compute the total number of elements - auto const total_elements = sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, stream); + auto const total_elements = + sizes_to_offsets(input_itr, input_itr + count + 1, d_offsets, 0, stream); CUDF_EXPECTS( total_elements <= static_cast(std::numeric_limits::max()), "Size of output exceeds the column size limit", diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 44a86f1c84f..135f645817e 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -73,7 +73,7 @@ CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus) * `modulus` is positive and does not check for overflow. 
*/ template -constexpr S round_down_safe(S number_to_round, S modulus) noexcept +CUDF_HOST_DEVICE constexpr S round_down_safe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; @@ -113,16 +113,16 @@ CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexc * the result will be incorrect */ template -constexpr S div_rounding_up_unsafe(S const& dividend, T const& divisor) noexcept +CUDF_HOST_DEVICE constexpr S div_rounding_up_unsafe(S const& dividend, T const& divisor) noexcept { return (dividend + divisor - 1) / divisor; } namespace detail { template -constexpr I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(cuda::std::integral_constant, + I dividend, + I divisor) noexcept { // TODO: This could probably be implemented faster return (dividend > divisor) ? 1 + div_rounding_up_unsafe(dividend - divisor, divisor) @@ -130,7 +130,9 @@ constexpr I div_rounding_up_safe(std::integral_constant, } template -constexpr I div_rounding_up_safe(std::integral_constant, I dividend, I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(cuda::std::integral_constant, + I dividend, + I divisor) noexcept { auto quotient = dividend / divisor; auto remainder = dividend % divisor; @@ -156,9 +158,9 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend, * the non-integral division `dividend/divisor` */ template -constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept { - using i_is_a_signed_type = std::integral_constant>; + using i_is_a_signed_type = cuda::std::integral_constant>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp index 
2bafe235058..643d1d07cb7 100644 --- a/cpp/include/cudf/fixed_point/temporary.hpp +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ auto to_string(T value) -> std::string } template -constexpr auto abs(T value) +CUDF_HOST_DEVICE constexpr auto abs(T value) { return value >= 0 ? value : -value; } @@ -72,7 +72,7 @@ CUDF_HOST_DEVICE inline auto max(T lhs, T rhs) } template -constexpr auto exp10(int32_t exponent) +CUDF_HOST_DEVICE constexpr auto exp10(int32_t exponent) { BaseType value = 1; while (exponent > 0) diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index 32187b43d34..24b8738d5dd 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,8 @@ #include +#include + #include namespace CUDF_EXPORT cudf { @@ -45,7 +47,7 @@ struct multistate { * * @note: The behavior of this function is undefined if size() => max_segment_count */ - constexpr void enqueue(uint8_t head, uint8_t tail) + CUDF_HOST_DEVICE constexpr void enqueue(uint8_t head, uint8_t tail) { _heads |= (head & 0xFu) << (_size * 4); _tails |= (tail & 0xFu) << (_size * 4); @@ -55,17 +57,17 @@ struct multistate { /** * @brief get's the number of segments this multistate represents */ - [[nodiscard]] constexpr uint8_t size() const { return _size; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t size() const { return _size; } /** * @brief get's the highest (____, tail] value this multistate represents */ - [[nodiscard]] constexpr uint8_t max_tail() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t max_tail() const { uint8_t maximum = 0; for (uint8_t i = 0; i < _size; i++) { - maximum = std::max(maximum, get_tail(i)); + maximum = cuda::std::max(maximum, get_tail(i)); } return maximum; @@ -74,7 +76,7 @@ struct multistate { /** * @brief get's the Nth (head, ____] value state this multistate represents */ - [[nodiscard]] constexpr uint8_t get_head(uint8_t idx) const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t get_head(uint8_t idx) const { return (_heads >> (idx * 4)) & 0xFu; } @@ -82,7 +84,7 @@ struct multistate { /** * @brief get's the Nth (____, tail] value state this multistate represents */ - [[nodiscard]] constexpr uint8_t get_tail(uint8_t idx) const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t get_tail(uint8_t idx) const { return (_tails >> (idx * 4)) & 0xFu; } diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index 8440805960e..5ae4af411b6 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include +#include #include #include #include @@ -46,7 +47,7 @@ __device__ inline thrust::pair parse_integer( // highest value where another decimal digit cannot be appended without an overflow; // this preserves the most digits when scaling the final result for this type constexpr UnsignedDecimalType decimal_max = - (std::numeric_limits::max() - 9L) / 10L; + (cuda::std::numeric_limits::max() - 9L) / 10L; __uint128_t value = 0; // for checking overflow int32_t exp_offset = 0; @@ -90,7 +91,8 @@ __device__ inline thrust::pair parse_integer( template __device__ cuda::std::optional parse_exponent(char const* iter, char const* iter_end) { - constexpr uint32_t exponent_max = static_cast(std::numeric_limits::max()); + constexpr uint32_t exponent_max = + static_cast(cuda::std::numeric_limits::max()); // get optional exponent sign int32_t const exp_sign = [&iter] { diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh index 0ee26ec9ee2..af4a4ce7cd2 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include +#include + namespace cudf::strings::detail { /** @@ -33,7 +35,7 @@ __device__ inline int32_t fixed_point_string_size(__int128_t const& value, int32 auto const abs_value = numeric::detail::abs(value); auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); auto const fraction = count_digits(abs_value % exp_ten); - auto const num_zeros = std::max(0, (-scale - fraction)); + auto const num_zeros = cuda::std::max(0, (-scale - fraction)); return static_cast(value < 0) + // sign if negative count_digits(abs_value / exp_ten) + // integer 1 + // decimal point @@ -66,7 +68,7 @@ __device__ inline void fixed_point_to_string(__int128_t const& value, int32_t sc if (value < 0) *out_ptr++ = '-'; // add sign auto const abs_value = numeric::detail::abs(value); auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); - auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); + auto const num_zeros = cuda::std::max(0, (-scale - count_digits(abs_value % exp_ten))); out_ptr += integer_to_string(abs_value / exp_ten, out_ptr); // add the integer part *out_ptr++ = '.'; // add decimal point diff --git a/cpp/include/cudf/strings/detail/convert/int_to_string.cuh b/cpp/include/cudf/strings/detail/convert/int_to_string.cuh index f6e6a10a864..39b9cd6978c 100644 --- a/cpp/include/cudf/strings/detail/convert/int_to_string.cuh +++ b/cpp/include/cudf/strings/detail/convert/int_to_string.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -67,7 +67,7 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) * @return size_type number of digits in input value */ template -constexpr size_type count_digits(IntegerType value) +__device__ constexpr size_type count_digits(IntegerType value) { if (value == 0) return 1; bool const is_negative = cuda::std::is_signed() ? (value < 0) : false; diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index de2f1770e28..cd386ea886f 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,21 @@ namespace cudf { namespace strings { namespace detail { +template +struct string_offsets_fn { + Iter _begin; + size_type _strings_count; + constexpr string_offsets_fn(Iter begin, size_type strings_count) + : _begin{begin}, _strings_count{strings_count} + { + } + + __device__ constexpr size_type operator()(size_type idx) const noexcept + { + return idx < _strings_count ? static_cast(_begin[idx]) : size_type{0}; + }; +}; + /** * @brief Gather characters to create a strings column using the given string-index pair iterator * @@ -133,14 +148,11 @@ std::pair, int64_t> make_offsets_child_column( // using exclusive-scan technically requires strings_count+1 input values even though // the final input value is never used. // The input iterator is wrapped here to allow the 'last value' to be safely read. - auto map_fn = cuda::proclaim_return_type( - [begin, strings_count] __device__(size_type idx) -> size_type { - return idx < strings_count ? 
static_cast(begin[idx]) : size_type{0}; - }); - auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); + auto input_itr = + cudf::detail::make_counting_transform_iterator(0, string_offsets_fn{begin, strings_count}); // Use the sizes-to-offsets iterator to compute the total number of elements auto const total_bytes = - cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); + cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, 0, stream); auto const threshold = cudf::strings::get_offset64_threshold(); CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total_bytes < threshold), @@ -151,7 +163,8 @@ std::pair, int64_t> make_offsets_child_column( offsets_column = make_numeric_column( data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); auto d_offsets64 = offsets_column->mutable_view().template data(); - cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream); + cudf::detail::sizes_to_offsets( + input_itr, input_itr + strings_count + 1, d_offsets64, 0, stream); } return std::pair(std::move(offsets_column), total_bytes); diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh index bfdba238a1e..0463eca65e9 100644 --- a/cpp/src/io/csv/datetime.cuh +++ b/cpp/src/io/csv/datetime.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,7 +197,7 @@ __inline__ __device__ cuda::std::chrono::hh_mm_ss extract_time_of_d /** * @brief Checks whether `c` is decimal digit */ -constexpr bool is_digit(char c) { return c >= '0' and c <= '9'; } +__device__ constexpr bool is_digit(char c) { return c >= '0' and c <= '9'; } /** * @brief Parses a datetime string and computes the corresponding timestamp. 
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1a0c59e365a..1587c4da9c8 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -376,6 +376,48 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, {}); } +struct scatter_fn { + column_device_view _col; + size_type* _d_strview_offsets; + string_view* _d_strviews; + size_type const* _labels; + size_type const* _list_offsets; + column_device_view _d_strings_children; + string_view _element_separator; + string_view _element_narep; + + scatter_fn(column_device_view col, + size_type* d_strview_offsets, + string_view* d_strviews, + size_type const* labels, + size_type const* list_offsets, + column_device_view d_strings_children, + string_view const element_separator, + string_view const element_narep) noexcept + : _col{col}, + _d_strview_offsets{d_strview_offsets}, + _d_strviews{d_strviews}, + _labels{labels}, + _list_offsets{list_offsets}, + _d_strings_children{d_strings_children}, + _element_separator{element_separator}, + _element_narep{element_narep} + { + } + + __device__ void operator()(size_type idx) const + { + auto const label = _labels[idx]; + auto const sublist_index = idx - _list_offsets[label]; + auto const strview_index = _d_strview_offsets[label] + sublist_index * 2 + 1; + // value or na_rep + auto const strview = _d_strings_children.element(idx); + _d_strviews[strview_index] = _d_strings_children.is_null(idx) ? _element_narep : strview; + // separator + if (sublist_index != 0) { _d_strviews[strview_index - 1] = _element_separator; } + } +}; + /** * @brief Concatenates a list of strings columns into a single strings column.
* @@ -461,24 +503,14 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_strings), - [col = *col_device_view, - d_strview_offsets = d_strview_offsets.begin(), - d_strviews = d_strviews.begin(), - labels = labels->view().begin(), - list_offsets = offsets.begin(), - d_strings_children = *d_strings_children, - element_separator, - element_narep] __device__(auto idx) { - auto const label = labels[idx]; - auto const sublist_index = idx - list_offsets[label]; - auto const strview_index = d_strview_offsets[label] + sublist_index * 2 + 1; - // value or na_rep - auto const strview = d_strings_children.element(idx); - d_strviews[strview_index] = - d_strings_children.is_null(idx) ? element_narep : strview; - // separator - if (sublist_index != 0) { d_strviews[strview_index - 1] = element_separator; } - }); + scatter_fn{*col_device_view, + d_strview_offsets.data(), + d_strviews.data(), + labels->view().data(), + offsets.data(), + *d_strings_children, + element_separator, + element_narep}); auto joined_col = make_strings_column(d_strviews, string_view{nullptr, 0}, stream, mr); diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 5ab36fdae8e..8dccf65ef10 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -707,7 +707,7 @@ struct orc_column_device_view : public column_device_view { struct rowgroup_rows { size_type begin; size_type end; - [[nodiscard]] constexpr auto size() const noexcept { return end - begin; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return end - begin; } }; } // namespace orc diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index e01b93262d7..5f4c1e0696d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -22,6 +22,8 @@ #include +#include + namespace cudf::io::orc::gpu { using strings::detail::fixed_point_string_size; @@ -212,7 +214,7 @@ __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* r } // Splits a nanosecond timestamp into milliseconds and nanoseconds -__device__ std::pair split_nanosecond_timestamp(int64_t nano_count) +__device__ cuda::std::pair split_nanosecond_timestamp(int64_t nano_count) { auto const ns = cuda::std::chrono::nanoseconds(nano_count); auto const ms_floor = cuda::std::chrono::floor(ns); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index bcdd059bf67..857daeb5856 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -413,8 +414,8 @@ static __device__ uint32_t IntegerRLE( // Find minimum and maximum values if (literal_run > 0) { // Find min & max - T vmin = (t < literal_run) ? v0 : std::numeric_limits::max(); - T vmax = (t < literal_run) ? v0 : std::numeric_limits::min(); + T vmin = (t < literal_run) ? v0 : cuda::std::numeric_limits::max(); + T vmax = (t < literal_run) ? v0 : cuda::std::numeric_limits::min(); uint32_t literal_mode, literal_w; vmin = block_reduce(temp_storage).Reduce(vmin, cub::Min()); __syncthreads(); @@ -448,7 +449,7 @@ static __device__ uint32_t IntegerRLE( } else { uint32_t range, w; // Mode 2 base value cannot be bigger than max int64_t, i.e. 
the first bit has to be 0 - if (vmin <= std::numeric_limits::max() and mode1_w > mode2_w and + if (vmin <= cuda::std::numeric_limits::max() and mode1_w > mode2_w and (literal_run - 1) * (mode1_w - mode2_w) > 4) { s->u.intrle.literal_mode = 2; w = mode2_w; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 0c739f59b0a..5e23bc5adcc 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -243,9 +244,9 @@ enum row_entry_state_e { */ static auto __device__ index_order_from_index_types(uint32_t index_types_bitmap) { - constexpr std::array full_order = {CI_PRESENT, CI_DATA, CI_DATA2}; + constexpr cuda::std::array full_order = {CI_PRESENT, CI_DATA, CI_DATA2}; - std::array partial_order; + cuda::std::array partial_order; thrust::copy_if(thrust::seq, full_order.cbegin(), full_order.cend(), diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a0cd126cff0..5c3377a1aeb 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -71,7 +71,7 @@ namespace cudf::io::orc::detail { template -[[nodiscard]] constexpr int varint_size(T val) +[[nodiscard]] CUDF_HOST_DEVICE constexpr int varint_size(T val) { auto len = 1u; while (val > 0x7f) { diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b8e72aaac88..023402cbcf6 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -359,10 +359,10 @@ class parquet_field_struct : public parquet_field { template class parquet_field_union_struct : public parquet_field { E& enum_val; - std::optional& val; // union structs are always wrapped in std::optional + cuda::std::optional& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, std::optional& v) + parquet_field_union_struct(int f, E& ev, 
cuda::std::optional& v) : parquet_field(f), enum_val(ev), val(v) { } diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu index f63a4fb79b9..84f751dea6b 100644 --- a/cpp/src/io/parquet/decode_fixed.cu +++ b/cpp/src/io/parquet/decode_fixed.cu @@ -942,6 +942,7 @@ constexpr bool is_split_decode() * @param chunks List of column chunks * @param min_row Row index to start reading at * @param num_rows Maximum number of rows to read + * @param initial_str_offsets Vector to store the initial offsets for large nested string cols * @param error_code Error code to set if an error is encountered */ template @@ -950,6 +951,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) device_span chunks, size_t min_row, size_t num_rows, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code) { constexpr bool has_dict_t = has_dict(); @@ -1161,11 +1163,14 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size_t, 8) valid_count = next_valid_count; } - // Now turn the array of lengths into offsets, but skip if this is a large string column. In the - // latter case, offsets will be computed during string column creation. if constexpr (has_strings_t) { - if (!s->col.is_large_string_col) { - convert_small_string_lengths_to_offsets(s); + // For large strings, update the initial string buffer offset to be used during large string + // column construction. Otherwise, convert string sizes to final offsets. 
+ if (s->col.is_large_string_col) { + compute_initial_large_strings_offset( + s, initial_str_offsets[pages[page_idx].chunk_idx], has_lists_t); + } else { + convert_small_string_lengths_to_offsets(s, has_lists_t); } } if (t == 0 and s->error != 0) { set_error(s->error, error_code); } @@ -1185,6 +1190,7 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, decode_kernel_mask kernel_mask, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -1199,11 +1205,11 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span pages, if (level_type_size == 1) { gpuDecodePageDataGeneric <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } else { gpuDecodePageDataGeneric <<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } }; diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 5b9831668e6..2f402e3c4b8 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -57,7 +57,7 @@ __device__ size_type gpuDeltaLengthPageStringSize(page_state_s* s, int t) delta_binary_decoder string_lengths; auto const* string_start = string_lengths.find_end_of_block(s->data_start, s->data_end); // distance is size of string data - return static_cast(std::distance(string_start, s->data_end)); + return static_cast(thrust::distance(string_start, s->data_end)); } return 0; } diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 1fa05b3a6c2..339a6233c4d 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -105,7 +105,7 @@ struct delta_binary_decoder { // returns the value stored in the `value` array at index // `rolling_index(idx)`. If `idx` is `0`, then return `first_value`. - constexpr zigzag128_t value_at(size_type idx) + __device__ constexpr zigzag128_t value_at(size_type idx) { return idx == 0 ? first_value : value[rolling_index(idx)]; } @@ -113,7 +113,7 @@ struct delta_binary_decoder { // returns the number of values encoded in the block data. when all_values is true, // account for the first value in the header. otherwise just count the values encoded // in the mini-block data. - constexpr uint32_t num_encoded_values(bool all_values) + __device__ constexpr uint32_t num_encoded_values(bool all_values) { return value_count == 0 ? 0 : all_values ? value_count : value_count - 1; } diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh index 49f4ccedbf0..56b7c8065ee 100644 --- a/cpp/src/io/parquet/delta_enc.cuh +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,8 @@ #include #include +#include +#include namespace cudf::io::parquet::detail { @@ -57,7 +59,7 @@ constexpr int buffer_size = 2 * block_size; static_assert(block_size % 128 == 0); static_assert(values_per_mini_block % 32 == 0); -constexpr int rolling_idx(int index) { return rolling_index(index); } +__device__ constexpr int rolling_idx(int index) { return rolling_index(index); } // Version of bit packer that can handle up to 64 bits values. // T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long @@ -67,8 +69,8 @@ template inline __device__ void bitpack_mini_block( uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space) { - using wide_type = - std::conditional_t, __uint128_t, uint64_t>; + using wide_type = cuda::std:: + conditional_t, __uint128_t, uint64_t>; using cudf::detail::warp_size; scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1; auto constexpr div = sizeof(scratch_type) * 8; @@ -235,7 +237,7 @@ class delta_binary_packer { size_type const idx = _current_idx + t; T const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)], _buffer[delta::rolling_idx(idx - 1)]) - : std::numeric_limits::max(); + : cuda::std::numeric_limits::max(); // Find min delta for the block. 
auto const min_delta = block_reduce(*_block_tmp).Reduce(delta, cub::Min()); diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a5023e23cc5..b101733d35e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -28,7 +28,7 @@ namespace cudf::io::parquet::detail { struct page_state_s { - constexpr page_state_s() noexcept {} + CUDF_HOST_DEVICE constexpr page_state_s() noexcept {} uint8_t const* data_start{}; uint8_t const* data_end{}; uint8_t const* lvl_end{}; @@ -121,7 +121,8 @@ struct null_count_back_copier { /** * @brief Test if the given page is in a string column */ -constexpr bool is_string_col(PageInfo const& page, device_span chunks) +__device__ constexpr bool is_string_col(PageInfo const& page, + device_span chunks) { if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return false; } auto const& col = chunks[page.chunk_idx]; diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu index 0c9d4e77f0c..4c98a08006c 100644 --- a/cpp/src/io/parquet/page_delta_decode.cu +++ b/cpp/src/io/parquet/page_delta_decode.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -435,6 +435,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) device_span chunks, size_t min_row, size_t num_rows, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code) { using cudf::detail::warp_size; @@ -579,17 +580,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) __syncthreads(); } - // Now turn the array of lengths into offsets, but skip if this is a large string column. In the - // latter case, offsets will be computed during string column creation. 
- if (not s->col.is_large_string_col) { - int value_count = nesting_info_base[leaf_level_index].value_count; - - // if no repetition we haven't calculated start/end bounds and instead just skipped - // values until we reach first_row. account for that here. - if (!has_repetition) { value_count -= s->first_row; } - - auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); - block_excl_sum(offptr, value_count, s->page.str_offset); + // For large strings, update the initial string buffer offset to be used during large string + // column construction. Otherwise, convert string sizes to final offsets. + if (s->col.is_large_string_col) { + compute_initial_large_strings_offset( + s, initial_str_offsets[pages[page_idx].chunk_idx], has_repetition); + } else { + convert_small_string_lengths_to_offsets(s, has_repetition); } if (t == 0 and s->error != 0) { set_error(s->error, error_code); } @@ -603,6 +600,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) device_span chunks, size_t min_row, size_t num_rows, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code) { using cudf::detail::warp_size; @@ -741,17 +739,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size) __syncthreads(); } - // Now turn the array of lengths into offsets, but skip if this is a large string column. In the - // latter case, offsets will be computed during string column creation. - if (not s->col.is_large_string_col) { - int value_count = nesting_info_base[leaf_level_index].value_count; - - // if no repetition we haven't calculated start/end bounds and instead just skipped - // values until we reach first_row. account for that here. 
- if (!has_repetition) { value_count -= s->first_row; } - - auto const offptr = reinterpret_cast(nesting_info_base[leaf_level_index].data_out); - block_excl_sum(offptr, value_count, s->page.str_offset); + // For large strings, update the initial string buffer offset to be used during large string + // column construction. Otherwise, convert string sizes to final offsets. + if (s->col.is_large_string_col) { + compute_initial_large_strings_offset( + s, initial_str_offsets[pages[page_idx].chunk_idx], has_repetition); + } else { + convert_small_string_lengths_to_offsets(s, has_repetition); } // finally, copy the string data into place @@ -797,6 +791,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -807,10 +802,10 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, if (level_type_size == 1) { gpuDecodeDeltaByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } else { gpuDecodeDeltaByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } } @@ -822,6 +817,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream) { @@ -832,10 +828,10 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, if (level_type_size == 1) { gpuDecodeDeltaLengthByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } else { gpuDecodeDeltaLengthByteArray<<>>( - pages.device_ptr(), chunks, min_row, num_rows, 
error_code); + pages.device_ptr(), chunks, min_row, num_rows, initial_str_offsets, error_code); } } diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 7dc1255af6f..56d638c68eb 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -59,7 +62,7 @@ constexpr int encode_block_size = 128; constexpr int rle_buffer_size = 2 * encode_block_size; constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; -constexpr int rolling_idx(int pos) { return rolling_index(pos); } +__device__ constexpr int rolling_idx(int pos) { return rolling_index(pos); } // max V1 header size // also valid for dict page header (V1 or V2) @@ -113,7 +116,7 @@ using rle_page_enc_state_s = page_enc_state_s; /** * @brief Returns the size of the type in the Parquet file. */ -constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_length) +__device__ constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_length) { if (physical_type == FIXED_LEN_BYTE_ARRAY) { return id == type_id::DECIMAL128 ? 
sizeof(__int128_t) : type_length; @@ -127,7 +130,7 @@ constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_le } } -constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_values) +__device__ constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_values) { if (value_bit_width == 0) return 0; @@ -145,7 +148,7 @@ constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_value } // subtract b from a, but return 0 if this would underflow -constexpr size_t underflow_safe_subtract(size_t a, size_t b) +__device__ constexpr size_t underflow_safe_subtract(size_t a, size_t b) { if (b > a) { return 0; } return a - b; @@ -228,7 +231,8 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) __syncthreads(); // page fragment size must fit in a 32-bit signed integer - if (s->frag.fragment_data_size > static_cast(std::numeric_limits::max())) { + if (s->frag.fragment_data_size > + static_cast(cuda::std::numeric_limits::max())) { // TODO need to propagate this error back to the host CUDF_UNREACHABLE("page fragment size exceeds maximum for i32"); } @@ -357,7 +361,7 @@ struct BitwiseOr { template __device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, uint64_t* buffer, void* temp_space) { - using output_type = std::conditional_t; + using output_type = cuda::std::conditional_t; __shared__ delta_binary_packer packer; auto const t = threadIdx.x; @@ -737,7 +741,7 @@ CUDF_KERNEL void __launch_bounds__(128) : frag_g.fragment_data_size; // page fragment size must fit in a 32-bit signed integer - if (fragment_data_size > std::numeric_limits::max()) { + if (fragment_data_size > cuda::std::numeric_limits::max()) { CUDF_UNREACHABLE("page fragment size exceeds maximum for i32"); } @@ -816,7 +820,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_size + rle_pad + (write_v2_headers ? 
page_g.max_lvl_size : def_level_size + rep_level_size); // page size must fit in 32-bit signed integer - if (max_data_size > std::numeric_limits::max()) { + if (max_data_size > cuda::std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); } // if byte_array then save the variable bytes size @@ -1321,7 +1325,7 @@ static __device__ void PlainBoolEncode(rle_page_enc_state_s* s, * @return The difference between two epochs in `cuda::std::chrono::duration` format with a period * of hours. */ -constexpr auto julian_calendar_epoch_diff() +__device__ constexpr auto julian_calendar_epoch_diff() { using namespace cuda::std::chrono; using namespace cuda::std::chrono_literals; @@ -1346,7 +1350,7 @@ __device__ auto julian_days_with_time(int64_t v) auto const dur_time_of_day = dur_total - dur_days; auto const dur_time_of_day_nanos = duration_cast(dur_time_of_day); auto const julian_days = dur_days + ceil(julian_calendar_epoch_diff()); - return std::make_pair(dur_time_of_day_nanos, julian_days); + return cuda::std::pair{dur_time_of_day_nanos, julian_days}; } // this has been split out into its own kernel because of the amount of shared memory required @@ -1711,7 +1715,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) : 0; val_idx = val_idx_in_leaf_col; } - return std::make_tuple(is_valid, val_idx); + return cuda::std::make_tuple(is_valid, val_idx); }(); cur_val_idx += nvals; @@ -1950,7 +1954,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) // need to test for use_dictionary because it might be boolean uint32_t const val_idx = (s->ck.use_dictionary) ? 
val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; - return std::make_tuple(is_valid, val_idx); + return cuda::std::tuple{is_valid, val_idx}; }(); cur_val_idx += nvals; @@ -2200,7 +2204,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) auto const arr_size = get_element(*s->col.leaf_column, val_idx).size_bytes(); // the lengths are assumed to be INT32, check for overflow - if (arr_size > static_cast(std::numeric_limits::max())) { + if (arr_size > static_cast(cuda::std::numeric_limits::max())) { CUDF_UNREACHABLE("byte array size exceeds 2GB"); } v = static_cast(arr_size); @@ -2641,7 +2645,7 @@ class header_encoder { cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::LIST); auto const t_num = static_cast(type); current_header_ptr = cpw_put_uint8( - current_header_ptr, static_cast((std::min(len, size_t{0xfu}) << 4) | t_num)); + current_header_ptr, static_cast((cuda::std::min(len, size_t{0xfu}) << 4) | t_num)); if (len >= 0xf) { current_header_ptr = cpw_put_uint32(current_header_ptr, len); } current_field_index = 0; } @@ -2802,10 +2806,8 @@ __device__ bool increment_utf8_at(unsigned char* ptr) * * @return Pair object containing a pointer to the truncated data and its length. */ -__device__ std::pair truncate_utf8(device_span span, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_utf8( + device_span span, bool is_min, void* scratch, int32_t truncate_length) { // we know at this point that truncate_length < size_bytes, so // there is data at [len]. 
work backwards until we find @@ -2842,10 +2844,10 @@ __device__ std::pair truncate_utf8(device_span truncate_binary(device_span arr, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_binary(device_span arr, + bool is_min, + void* scratch, + int32_t truncate_length) { if (is_min) { return {arr.data(), truncate_length}; } memcpy(scratch, arr.data(), truncate_length); @@ -2869,10 +2871,10 @@ __device__ std::pair truncate_binary(device_span truncate_string(string_view const& str, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_string(string_view const& str, + bool is_min, + void* scratch, + int32_t truncate_length) { if (truncate_length == NO_TRUNC_STATS or str.size_bytes() <= truncate_length) { return {str.data(), str.size_bytes()}; @@ -2893,7 +2895,7 @@ __device__ std::pair truncate_string(string_view const& s /** * @brief Attempt to truncate a binary array to at most truncate_length bytes. */ -__device__ std::pair truncate_byte_array( +__device__ cuda::std::pair truncate_byte_array( statistics::byte_array_view const& arr, bool is_min, void* scratch, int32_t truncate_length) { if (truncate_length == NO_TRUNC_STATS or arr.size_bytes() <= truncate_length) { @@ -2914,11 +2916,11 @@ __device__ std::pair truncate_byte_array( * valid min or max binary value. String and byte array types will be truncated if they exceed * truncate_length. 
*/ -__device__ std::pair get_extremum(statistics_val const* stats_val, - statistics_dtype dtype, - void* scratch, - bool is_min, - int32_t truncate_length) +__device__ cuda::std::pair get_extremum(statistics_val const* stats_val, + statistics_dtype dtype, + void* scratch, + bool is_min, + int32_t truncate_length) { switch (dtype) { case dtype_bool: return {stats_val, sizeof(bool)}; diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh index dc4140d0a44..ba627e73625 100644 --- a/cpp/src/io/parquet/page_string_utils.cuh +++ b/cpp/src/io/parquet/page_string_utils.cuh @@ -20,6 +20,8 @@ #include +#include + namespace cudf::io::parquet::detail { // stole this from cudf/strings/detail/gather.cuh. modified to run on a single string on one warp. @@ -98,21 +100,54 @@ __device__ inline void block_excl_sum(size_type* arr, size_type length, size_typ } } -template -__device__ inline void convert_small_string_lengths_to_offsets(page_state_s* s) +/** + * @brief Converts string sizes to offsets if this is not a large string column. Otherwise, + * atomically update the initial string offset to be used during large string column construction + */ +template +__device__ void convert_small_string_lengths_to_offsets(page_state_s const* const state, + bool has_lists) { // If this is a large string column. In the // latter case, offsets will be computed during string column creation. - auto& ni = s->nesting_info[s->col.max_nesting_depth - 1]; + auto& ni = state->nesting_info[state->col.max_nesting_depth - 1]; int value_count = ni.value_count; // if no repetition we haven't calculated start/end bounds and instead just skipped // values until we reach first_row. account for that here. 
- if constexpr (!has_lists) { value_count -= s->first_row; } + if (not has_lists) { value_count -= state->first_row; } + + // Convert the array of lengths into offsets + if (value_count > 0) { + auto const offptr = reinterpret_cast(ni.data_out); + auto const initial_value = state->page.str_offset; + block_excl_sum(offptr, value_count, initial_value); + } +} - auto const offptr = reinterpret_cast(ni.data_out); - auto const initial_value = s->page.str_offset; - block_excl_sum(offptr, value_count, initial_value); +/** + * @brief Atomically update the initial string offset to be used during large string column + * construction + */ +inline __device__ void compute_initial_large_strings_offset(page_state_s const* const state, + size_t& initial_str_offset, + bool has_lists) +{ + // Values decoded by this page. + int value_count = state->nesting_info[state->col.max_nesting_depth - 1].value_count; + + // if no repetition we haven't calculated start/end bounds and instead just skipped + // values until we reach first_row. account for that here. + if (not has_lists) { value_count -= state->first_row; } + + // Atomically update the initial string offset if this is a large string column. This initial + // offset will be used to compute (64-bit) offsets during large string column construction. 
+ if (value_count > 0 and threadIdx.x == 0) { + auto const initial_value = state->page.str_offset; + cuda::atomic_ref initial_str_offsets_ref{ + initial_str_offset}; + initial_str_offsets_ref.fetch_min(initial_value, cuda::std::memory_order_relaxed); + } } template diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index dc0c4b1540e..f7cbe2bd924 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,6 +20,8 @@ #include +#include + #include #include #include @@ -92,10 +94,10 @@ struct LogicalType { BSON }; Type type; - std::optional decimal_type; - std::optional time_type; - std::optional timestamp_type; - std::optional int_type; + cuda::std::optional decimal_type; + cuda::std::optional time_type; + cuda::std::optional timestamp_type; + cuda::std::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -103,36 +105,36 @@ struct LogicalType { LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} - [[nodiscard]] constexpr bool is_time_millis() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_millis() const { return type == TIME and time_type->unit.type == TimeUnit::MILLIS; } - [[nodiscard]] constexpr bool is_time_micros() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_micros() const { return type == TIME and time_type->unit.type == TimeUnit::MICROS; } - [[nodiscard]] constexpr bool is_time_nanos() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_nanos() const { return type == TIME and time_type->unit.type == TimeUnit::NANOS; } - [[nodiscard]] constexpr bool is_timestamp_millis() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_millis() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS; } - [[nodiscard]] constexpr bool is_timestamp_micros() const + [[nodiscard]] 
CUDF_HOST_DEVICE constexpr bool is_timestamp_micros() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS; } - [[nodiscard]] constexpr bool is_timestamp_nanos() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_nanos() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS; } - [[nodiscard]] constexpr int8_t bit_width() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr int8_t bit_width() const { return type == INTEGER ? int_type->bitWidth : -1; } @@ -144,7 +146,7 @@ struct LogicalType { return type == DECIMAL ? decimal_type->scale : -1; } - [[nodiscard]] constexpr int32_t precision() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t precision() const { return type == DECIMAL ? decimal_type->precision : -1; } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 3c8d32572f8..a78da513b36 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -52,7 +53,7 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; constexpr int LEVEL_DECODE_BUF_SIZE = 2048; template -constexpr int rolling_index(int index) +CUDF_HOST_DEVICE constexpr int rolling_index(int index) { // Cannot divide by 0. But `rolling_size` will be 0 for unused arrays, so this case will never // actual be executed. @@ -78,7 +79,7 @@ constexpr uint8_t REP_LVL_HIST_CUTOFF = 0; constexpr uint8_t DEF_LVL_HIST_CUTOFF = 0; // see setupLocalPageInfo() in page_decode.cuh for supported page encodings -constexpr bool is_supported_encoding(Encoding enc) +CUDF_HOST_DEVICE constexpr bool is_supported_encoding(Encoding enc) { switch (enc) { case Encoding::PLAIN: @@ -96,7 +97,8 @@ constexpr bool is_supported_encoding(Encoding enc) /** * @brief Atomically OR `error` into `error_code`. 
*/ -constexpr void set_error(kernel_error::value_type error, kernel_error::pointer error_code) +__device__ constexpr void set_error(kernel_error::value_type error, + kernel_error::pointer error_code) { if (error != 0) { cuda::atomic_ref ref{*error_code}; @@ -162,14 +164,14 @@ using std::is_scoped_enum; // helpers to do bit operations on scoped enums template || is_scoped_enum::value))> -constexpr std::uint32_t BitAnd(Ts... bits) +CUDF_HOST_DEVICE constexpr std::uint32_t BitAnd(Ts... bits) { return (... & static_cast(bits)); } template || is_scoped_enum::value))> -constexpr std::uint32_t BitOr(Ts... bits) +CUDF_HOST_DEVICE constexpr std::uint32_t BitOr(Ts... bits) { return (... | static_cast(bits)); } @@ -401,7 +403,7 @@ inline auto make_page_key_iterator(device_span pages) * @brief Struct describing a particular chunk of column data */ struct ColumnChunkDesc { - constexpr ColumnChunkDesc() noexcept {}; + CUDF_HOST_DEVICE constexpr ColumnChunkDesc() noexcept {}; explicit ColumnChunkDesc(size_t compressed_size_, uint8_t* compressed_data_, size_t num_values_, @@ -498,8 +500,8 @@ struct parquet_column_device_view : stats_column_desc { int32_t type_length; //!< length of fixed_length_byte_array data uint8_t level_bits; //!< bits to encode max definition (lower nibble) & repetition (upper nibble) //!< levels - [[nodiscard]] constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; } - [[nodiscard]] constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; } + [[nodiscard]] __device__ constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; } + [[nodiscard]] __device__ constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; } uint8_t max_def_level; //!< needed for SizeStatistics calculation uint8_t max_rep_level; @@ -540,7 +542,7 @@ constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); struct EncPage; // convert Encoding to a mask value -constexpr uint32_t encoding_to_mask(Encoding encoding) 
+CUDF_HOST_DEVICE constexpr uint32_t encoding_to_mask(Encoding encoding) { return 1 << static_cast(encoding); } @@ -601,9 +603,15 @@ struct EncColumnChunk { uint32_t* rep_histogram_data; //!< Size is (max(level) + 1) * (num_data_pages + 1). size_t var_bytes_size; //!< Sum of var_bytes_size from the pages (byte arrays only) - [[nodiscard]] constexpr uint32_t num_dict_pages() const { return use_dictionary ? 1 : 0; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint32_t num_dict_pages() const + { + return use_dictionary ? 1 : 0; + } - [[nodiscard]] constexpr uint32_t num_data_pages() const { return num_pages - num_dict_pages(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint32_t num_data_pages() const + { + return num_pages - num_dict_pages(); + } }; /** @@ -642,15 +650,21 @@ struct EncPage { Encoding encoding; //!< Encoding used for page data uint16_t num_fragments; //!< Number of fragments in page - [[nodiscard]] constexpr bool is_v2() const { return page_type == PageType::DATA_PAGE_V2; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_v2() const + { + return page_type == PageType::DATA_PAGE_V2; + } - [[nodiscard]] constexpr auto level_bytes() const { return def_lvl_bytes + rep_lvl_bytes; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto level_bytes() const + { + return def_lvl_bytes + rep_lvl_bytes; + } }; /** * @brief Test if the given column chunk is in a string column */ -constexpr bool is_string_col(ColumnChunkDesc const& chunk) +__device__ constexpr bool is_string_col(ColumnChunkDesc const& chunk) { // return true for non-hashed byte_array and fixed_len_byte_array that isn't representing // a decimal. 
@@ -862,6 +876,7 @@ void DecodeDeltaBinary(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -870,6 +885,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -884,6 +900,7 @@ void DecodeDeltaByteArray(cudf::detail::hostdevice_span pages, * @param[in] num_rows Total number of rows to read * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding + * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -892,6 +909,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, size_t num_rows, size_t min_row, int level_type_size, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream); @@ -907,6 +925,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span pages, * @param[in] min_row Minimum number of rows to read * @param[in] level_type_size Size in bytes of the type for level decoding * @param[in] kernel_mask Mask indicating the type of decoding kernel to launch. 
+ * @param[out] initial_str_offsets Vector to store the initial offsets for large nested string cols * @param[out] error_code Error code for kernel failures * @param[in] stream CUDA stream to use */ @@ -916,6 +935,7 @@ void DecodePageData(cudf::detail::hostdevice_span pages, size_t min_row, int level_type_size, decode_kernel_mask kernel_mask, + cudf::device_span initial_str_offsets, kernel_error::pointer error_code, rmm::cuda_stream_view stream); diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index dff1f3f0c0e..9dd4e19de52 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -27,6 +27,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { @@ -210,10 +211,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num } } + // Create an empty device vector to store the initial str offset for large string columns from for + // string decoders. + auto initial_str_offsets = rmm::device_uvector{0, _stream, _mr}; + pass.chunks.host_to_device_async(_stream); chunk_nested_valids.host_to_device_async(_stream); chunk_nested_data.host_to_device_async(_stream); - if (has_strings) { chunk_nested_str_data.host_to_device_async(_stream); } + if (has_strings) { + // Host vector to initialize the initial string offsets + auto host_offsets_vector = + cudf::detail::make_host_vector(_input_columns.size(), _stream); + std::fill( + host_offsets_vector.begin(), host_offsets_vector.end(), std::numeric_limits::max()); + // Initialize the initial string offsets vector from the host vector + initial_str_offsets = + cudf::detail::make_device_uvector_async(host_offsets_vector, _stream, _mr); + chunk_nested_str_data.host_to_device_async(_stream); + } // create this before we fork streams kernel_error error_code(_stream); @@ -231,6 +246,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num skip_rows, level_type_size, decoder_mask, + 
initial_str_offsets, error_code.data(), streams[s_idx++]); }; @@ -287,6 +303,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num num_rows, skip_rows, level_type_size, + initial_str_offsets, error_code.data(), streams[s_idx++]); } @@ -298,6 +315,7 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num num_rows, skip_rows, level_type_size, + initial_str_offsets, error_code.data(), streams[s_idx++]); } @@ -402,6 +420,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num page_nesting.device_to_host_async(_stream); page_nesting_decode.device_to_host_async(_stream); + // Copy over initial string offsets from device + auto h_initial_str_offsets = cudf::detail::make_host_vector_async(initial_str_offsets, _stream); + if (auto const error = error_code.value_sync(_stream); error != 0) { CUDF_FAIL("Parquet data decode failed with code(s) " + kernel_error::to_string(error)); } @@ -440,6 +461,12 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); final_offsets.emplace_back(static_cast(col_string_sizes[idx])); } + // Nested large strings column + else if (input_col.nesting_depth() > 0) { + CUDF_EXPECTS(h_initial_str_offsets[idx] != std::numeric_limits::max(), + "Encountered invalid initial offset for large string column"); + out_buf.set_initial_string_offset(h_initial_str_offsets[idx]); + } } } } diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 933be889b1a..03a37327e9b 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -1079,7 +1079,7 @@ struct decomp_sum { { return {a.codec, a.num_pages + b.num_pages, - std::max(a.max_page_decompressed_size, b.max_page_decompressed_size), + cuda::std::max(a.max_page_decompressed_size, b.max_page_decompressed_size), a.total_decompressed_size + b.total_decompressed_size}; } }; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 25baa1e0ec8..7d3b6a39d5b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -30,6 +30,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 43666f9e42d..3874346e471 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -649,7 +649,7 @@ void decode_page_headers(pass_intermediate_data& pass, stream.synchronize(); } -constexpr bool is_string_chunk(ColumnChunkDesc const& chunk) +__device__ constexpr bool is_string_chunk(ColumnChunkDesc const& chunk) { auto const is_decimal = chunk.logical_type.has_value() and chunk.logical_type->type == LogicalType::DECIMAL; diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 3c49de0c997..2de2670b7a7 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ namespace cudf::io::parquet::detail { template -constexpr int rle_stream_required_run_buffer_size() +__device__ constexpr int rle_stream_required_run_buffer_size() { constexpr int num_rle_stream_decode_warps = (num_threads / cudf::detail::warp_size) - 1; return (num_rle_stream_decode_warps * 2); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 028f922bec3..37b1608463b 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,7 +66,7 @@ int32_t constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; int32_t constexpr TILES_PER_CHUNK = 4096; int32_t constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; -constexpr multistate transition_init(char c, cudf::device_span delim) +__device__ constexpr multistate transition_init(char c, cudf::device_span delim) { auto result = multistate(); @@ -79,7 +79,9 @@ constexpr multistate transition_init(char c, cudf::device_span delim return result; } -constexpr multistate transition(char c, multistate state, cudf::device_span delim) +__device__ constexpr multistate transition(char c, + multistate state, + cudf::device_span delim) { auto result = multistate(); @@ -182,7 +184,7 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( auto const thread_input_offset = tile_input_offset + cudf::thread_index_type{threadIdx.x} * ITEMS_PER_THREAD; auto const thread_input_size = - std::max(chunk_input_chars.size() - thread_input_offset, 0); + cuda::std::max(chunk_input_chars.size() - thread_input_offset, 0); // STEP 1: Load inputs @@ -257,7 +259,7 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( auto const thread_input_offset = 
tile_input_offset + cudf::thread_index_type{threadIdx.x} * ITEMS_PER_THREAD; auto const thread_input_size = - std::max(chunk_input_chars.size() - thread_input_offset, 0); + cuda::std::max(chunk_input_chars.size() - thread_input_offset, 0); // STEP 1: Load inputs @@ -555,7 +557,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source if (row == last_row && insert_end) { return thrust::make_pair(chars + begin, len); } else { - return thrust::make_pair(chars + begin, std::max(0, len - delim_size)); + return thrust::make_pair(chars + begin, cuda::std::max(0, len - delim_size)); }; })); return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr); diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp index da19539f509..5a8e3081681 100644 --- a/cpp/src/io/utilities/column_buffer.hpp +++ b/cpp/src/io/utilities/column_buffer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -249,6 +249,8 @@ class inline_column_buffer : public column_buffer_base { void create_string_data(size_t num_bytes, bool is_large_strings_col, rmm::cuda_stream_view stream); + void set_initial_string_offset(size_t offset) { initial_string_offset = offset; } + void* string_data() { return _string_data.data(); } [[nodiscard]] void const* string_data() const { return _string_data.data(); } [[nodiscard]] size_t string_size() const { return _string_data.size(); } @@ -257,6 +259,7 @@ class inline_column_buffer : public column_buffer_base { private: rmm::device_buffer _string_data{}; bool _is_large_strings_col{}; + size_t initial_string_offset{0}; }; using column_buffer = gather_column_buffer; diff --git a/cpp/src/io/utilities/column_buffer_strings.cu b/cpp/src/io/utilities/column_buffer_strings.cu index 66d0a644c12..6befc078bb2 100644 --- a/cpp/src/io/utilities/column_buffer_strings.cu +++ b/cpp/src/io/utilities/column_buffer_strings.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -37,7 +37,8 @@ std::unique_ptr cudf::io::detail::inline_column_buffer::make_string_colu data_type{type_id::INT64}, size + 1, mask_state::UNALLOCATED, stream, _mr); auto d_offsets64 = offsets_col->mutable_view().template data(); // it's safe to call with size + 1 because _data is also sized that large - cudf::detail::sizes_to_offsets(offsets_ptr, offsets_ptr + size + 1, d_offsets64, stream); + cudf::detail::sizes_to_offsets( + offsets_ptr, offsets_ptr + size + 1, d_offsets64, initial_string_offset, stream); return make_strings_column( size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask)); } else { diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 0c49b2e5d78..2750a17d328 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,7 +145,7 @@ __device__ __forceinline__ int32_t parse_unicode_hex(char const* str) * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to * \p out_it */ -constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) +__device__ constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) { auto const bytes = (out_it == nullptr) ? strings::detail::bytes_in_char_utf8(character) : strings::detail::from_char_utf8(character, out_it); diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh index 8183a66f4f0..46a3880df84 100644 --- a/cpp/src/io/utilities/output_builder.cuh +++ b/cpp/src/io/utilities/output_builder.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -59,7 +59,7 @@ class split_device_span { { } - [[nodiscard]] constexpr reference operator[](size_type i) const + [[nodiscard]] __device__ constexpr reference operator[](size_type i) const { return i < _head.size() ? _head[i] : _tail[i - _head.size()]; } diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 9833dab282e..a30ede957ec 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -30,7 +30,10 @@ #include +#include #include +#include +#include #include #include #include @@ -158,7 +161,7 @@ __device__ __forceinline__ thrust::pair get_escaped_char(char escape * @return uint8_t Numeric value of the character, or `0` */ template -constexpr uint8_t decode_digit(char c, bool* valid_flag) +__device__ constexpr uint8_t decode_digit(char c, bool* valid_flag) { if (c >= '0' && c <= '9') return c - '0'; if constexpr (as_hex and std::is_integral_v) { @@ -210,9 +213,9 @@ CUDF_HOST_DEVICE constexpr bool is_infinity(char const* begin, char const* end) * @return The parsed and converted value */ template -__host__ __device__ cuda::std::optional parse_numeric(char const* begin, - char const* end, - parse_options_view const& opts) +CUDF_HOST_DEVICE cuda::std::optional parse_numeric(char const* begin, + char const* end, + parse_options_view const& opts) { T value{}; bool all_digits_valid = true; @@ -222,8 +225,8 @@ __host__ __device__ cuda::std::optional parse_numeric(char const* begin, int32_t sign = (*begin == '-') ? 
-1 : 1; // Handle infinity - if (std::is_floating_point_v && is_infinity(begin, end)) { - return sign * std::numeric_limits::infinity(); + if (cuda::std::is_floating_point_v && is_infinity(begin, end)) { + return sign * cuda::std::numeric_limits::infinity(); } if (*begin == '-' || *begin == '+') begin++; @@ -244,7 +247,7 @@ __host__ __device__ cuda::std::optional parse_numeric(char const* begin, ++begin; } - if (std::is_floating_point_v) { + if (cuda::std::is_floating_point_v) { // Handle fractional part of the number if necessary double divisor = 1; while (begin < end) { @@ -449,7 +452,7 @@ __inline__ __device__ It skip_character(It const& it, char ch) * * @return Trimmed range */ -__inline__ __device__ std::pair trim_whitespaces_quotes( +__inline__ __device__ cuda::std::pair trim_whitespaces_quotes( char const* begin, char const* end, char quotechar = '\0') { auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; @@ -471,8 +474,8 @@ __inline__ __device__ std::pair trim_whitespaces_quote * * @return Trimmed range */ -__inline__ __device__ std::pair trim_whitespaces(char const* begin, - char const* end) +__inline__ __device__ cuda::std::pair trim_whitespaces(char const* begin, + char const* end) { auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); }; @@ -495,9 +498,9 @@ __inline__ __device__ std::pair trim_whitespaces(char * * @return Trimmed range */ -__inline__ __device__ std::pair trim_quotes(char const* begin, - char const* end, - char quotechar) +__inline__ __device__ cuda::std::pair trim_quotes(char const* begin, + char const* end, + char quotechar) { if ((thrust::distance(begin, end) >= 2 && *begin == quotechar && *thrust::prev(end) == quotechar)) { diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu index 4b50bf626f2..a98f3021da5 100644 --- a/cpp/src/lists/sequences.cu +++ b/cpp/src/lists/sequences.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -167,7 +167,7 @@ std::unique_ptr sequences(column_view const& starts, thrust::copy_n(rmm::exec_policy(stream), sizes_input_it, sizes.size(), offsets_begin); auto const n_elements = cudf::detail::sizes_to_offsets( - offsets_begin, offsets_begin + list_offsets->size(), offsets_begin, stream); + offsets_begin, offsets_begin + list_offsets->size(), offsets_begin, 0, stream); CUDF_EXPECTS(n_elements <= std::numeric_limits::max(), "Size of output exceeds the column size limit", std::overflow_error); diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 247440212d0..58c94b60718 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -348,7 +348,7 @@ std::pair, rmm::device_uvector> hash_subs count_substrings_kernel<<>>( *d_strings, width, offsets.data()); auto const total_hashes = - cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream); + cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), 0, stream); // hash substrings rmm::device_uvector hashes(total_hashes, stream); diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp index 39cd783de00..5d2db84ae2e 100644 --- a/cpp/tests/large_strings/parquet_tests.cpp +++ b/cpp/tests/large_strings/parquet_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,6 +16,8 @@ #include "large_strings_fixture.hpp" +#include +#include #include #include @@ -143,3 +145,96 @@ TEST_F(ParquetStringsTest, DISABLED_ChunkedReadLargeStrings) // Verify that we read exactly two table chunks EXPECT_EQ(tables.size(), 2); } + +TEST_F(ParquetStringsTest, ChunkedReadNestedLargeStrings) +{ + using int32s_col = cudf::test::fixed_width_column_wrapper; + using strings_col = cudf::test::strings_column_wrapper; + using structs_col = cudf::test::structs_column_wrapper; + + auto constexpr num_rows = 100'000; + + std::vector> input_columns; + auto const int_iter = thrust::make_counting_iterator(0); + input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release()); + + auto const str_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](int32_t i) { return std::to_string(i) + std::to_string(i) + std::to_string(i); }); + input_columns.emplace_back(strings_col{str_iter, str_iter + num_rows}.release()); + + auto offsets = std::vector{}; + offsets.reserve(num_rows * 2); + cudf::size_type num_structs = 0; + for (int i = 0; i < num_rows; ++i) { + offsets.push_back(num_structs); + auto const new_list_size = i % 4; + num_structs += new_list_size; + } + offsets.push_back(num_structs); + + auto const make_structs_col = [=] { + auto child1 = int32s_col(int_iter, int_iter + num_structs); + auto child2 = int32s_col(int_iter + num_structs, int_iter + num_structs * 2); + auto child3 = strings_col{str_iter, str_iter + num_structs}; + + return structs_col{{child1, child2, child3}}.release(); + }; + + input_columns.emplace_back( + cudf::make_lists_column(static_cast(offsets.size() - 1), + int32s_col(offsets.begin(), offsets.end()).release(), + make_structs_col(), + 0, + rmm::device_buffer{})); + + // Input table + auto const table = cudf::table{std::move(input_columns)}; + auto const expected = table.view(); + + auto const child3_view = expected.column(2).child(1).child(2); // list> + auto const column_size = + 
cudf::strings_column_view(child3_view).chars_size(cudf::get_default_stream()); + // set smaller threshold to reduce file size and execution time + auto const threshold = + column_size / 16; // Empirically set to get a mix of 32 and 64 bit string col chunks. + setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1); + + // Host buffer to write Parquet + auto buffer = std::vector{}; + // Writer options + cudf::io::parquet_writer_options out_opts = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{&buffer}, expected) + .max_page_size_bytes(512 * 1024) + .max_page_size_rows(20000) + .dictionary_policy(cudf::io::dictionary_policy::ALWAYS) + .write_v2_headers(false); + + // Write to Parquet + cudf::io::write_parquet(out_opts); + + // Reader options + cudf::io::parquet_reader_options in_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info(buffer.data(), buffer.size())); + + auto constexpr chunk_read_limit = size_t{1} * 1024 * 1024; + auto constexpr pass_read_limit = 0; + // Chunked parquet reader + auto reader = cudf::io::chunked_parquet_reader(chunk_read_limit, pass_read_limit, in_opts); + + // Read chunked + auto tables = std::vector>{}; + while (reader.has_next()) { + tables.emplace_back(reader.read_chunk().tbl); + } + auto table_views = std::vector{}; + std::transform(tables.begin(), tables.end(), std::back_inserter(table_views), [](auto& tbl) { + return tbl->view(); + }); + auto result = cudf::concatenate(table_views); + + // Verify tables to be equal + CUDF_TEST_EXPECT_TABLES_EQUAL(result->view(), expected); + + // go back to normal threshold + unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD"); +} diff --git a/dependencies.yaml b/dependencies.yaml index 54da3d98d09..30d477c91be 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -777,7 +777,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.11,<1.18 + - polars>=1.20,<1.22 run_cudf_polars_experimental: common: - 
output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py index 0c3159c73d6..a145cd770f9 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/datetime.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 # TODO: remove need for this # ruff: noqa: D101 @@ -58,6 +58,7 @@ class Name(IntEnum): OrdinalDay = auto() Quarter = auto() ReplaceTimeZone = auto() + Replace = auto() Round = auto() Second = auto() Time = auto() diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/string.py b/python/cudf_polars/cudf_polars/dsl/expressions/string.py index 256840c1f3d..a1c98a2ce1b 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/string.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/string.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 # TODO: remove need for this # ruff: noqa: D101 @@ -41,7 +41,7 @@ class Name(IntEnum): ConcatHorizontal = auto() ConcatVertical = auto() Contains = auto() - ContainsMany = auto() + ContainsAny = auto() CountMatches = auto() EndsWith = auto() EscapeRegex = auto() @@ -57,6 +57,7 @@ class Name(IntEnum): LenBytes = auto() LenChars = auto() Lowercase = auto() + Normalize = auto() PadEnd = auto() PadStart = auto() Replace = auto() diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index fd56329a48e..74f026e57cd 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -31,7 +31,6 @@ from cudf_polars.dsl.nodebase import Node from cudf_polars.dsl.to_ast import to_ast, to_parquet_filter from cudf_polars.utils import dtypes -from cudf_polars.utils.versions import POLARS_VERSION_GT_112 if TYPE_CHECKING: from collections.abc import Callable, Hashable, Iterable, MutableMapping, Sequence @@ -628,12 +627,7 @@ def slice_skip(tbl: plc.Table): ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index - if POLARS_VERSION_GT_112: - # If we sliced away some data from the start, that - # shifts the row index. - # But prior to 1.13, polars had this wrong, so we match behaviour - # https://github.com/pola-rs/polars/issues/19607 - offset += skip_rows + offset += skip_rows dtype = schema[name] step = plc.interop.from_arrow( pa.scalar(1, type=plc.interop.to_arrow(dtype)) @@ -763,11 +757,7 @@ def do_evaluate( c.obj.type() == dtype for c, dtype in zip(df.columns, schema.values(), strict=True) ) - if predicate is not None: - (mask,) = broadcast(predicate.evaluate(df), target_length=df.num_rows) - return df.filter(mask) - else: - return df + return df class Select(IR): @@ -1107,7 +1097,7 @@ class Join(IR): right_on: tuple[expr.NamedExpr, ...] 
"""List of expressions used as keys in the right frame.""" options: tuple[ - Literal["inner", "left", "right", "full", "semi", "anti", "cross"], + Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"], bool, tuple[int, int] | None, str, @@ -1142,50 +1132,45 @@ def __init__( # TODO: Implement maintain_order if options[5] != "none": raise NotImplementedError("maintain_order not implemented yet") - if any( - isinstance(e.value, expr.Literal) - for e in itertools.chain(self.left_on, self.right_on) - ): - raise NotImplementedError("Join with literal as join key.") @staticmethod @cache def _joiners( - how: Literal["inner", "left", "right", "full", "semi", "anti"], + how: Literal["Inner", "Left", "Right", "Full", "Semi", "Anti"], ) -> tuple[ Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None ]: - if how == "inner": + if how == "Inner": return ( plc.join.inner_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.DONT_CHECK, ) - elif how == "left" or how == "right": + elif how == "Left" or how == "Right": return ( plc.join.left_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "full": + elif how == "Full": return ( plc.join.full_join, plc.copying.OutOfBoundsPolicy.NULLIFY, plc.copying.OutOfBoundsPolicy.NULLIFY, ) - elif how == "semi": + elif how == "Semi": return ( plc.join.left_semi_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - elif how == "anti": + elif how == "Anti": return ( plc.join.left_anti_join, plc.copying.OutOfBoundsPolicy.DONT_CHECK, None, ) - assert_never(how) + assert_never(how) # pragma: no cover @staticmethod def _reorder_maps( @@ -1246,7 +1231,7 @@ def do_evaluate( left_on_exprs: Sequence[expr.NamedExpr], right_on_exprs: Sequence[expr.NamedExpr], options: tuple[ - Literal["inner", "left", "right", "full", "semi", "anti", "cross"], + Literal["Inner", "Left", "Right", "Full", "Semi", "Anti", "Cross"], bool, tuple[int, int] | None, 
str, @@ -1258,7 +1243,7 @@ def do_evaluate( ) -> DataFrame: """Evaluate and return a dataframe.""" how, join_nulls, zlice, suffix, coalesce, _ = options - if how == "cross": + if how == "Cross": # Separate implementation, since cross_join returns the # result, not the gather maps columns = plc.join.cross_join(left.table, right.table).columns() @@ -1295,25 +1280,32 @@ def do_evaluate( table = plc.copying.gather(left.table, lg, left_policy) result = DataFrame.from_table(table, left.column_names) else: - if how == "right": + if how == "Right": # Right join is a left join with the tables swapped left, right = right, left left_on, right_on = right_on, left_on lg, rg = join_fn(left_on.table, right_on.table, null_equality) - if how == "left" or how == "right": + if how == "Left" or how == "Right": # Order of left table is preserved lg, rg = cls._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy ) - if coalesce and how == "inner": - right = right.discard_columns(right_on.column_names_set) + if coalesce: + if how == "Full": + # In this case, keys must be column references, + # possibly with dtype casting. We should use them in + # preference to the columns from the original tables. + left = left.with_columns(left_on.columns, replace_only=True) + right = right.with_columns(right_on.columns, replace_only=True) + else: + right = right.discard_columns(right_on.column_names_set) left = DataFrame.from_table( plc.copying.gather(left.table, lg, left_policy), left.column_names ) right = DataFrame.from_table( plc.copying.gather(right.table, rg, right_policy), right.column_names ) - if coalesce and how != "inner": + if coalesce and how == "Full": left = left.with_columns( ( Column( @@ -1329,7 +1321,7 @@ def do_evaluate( replace_only=True, ) right = right.discard_columns(right_on.column_names_set) - if how == "right": + if how == "Right": # Undo the swap for right join before gluing together. 
left, right = right, left right = right.rename_columns( @@ -1374,7 +1366,9 @@ def do_evaluate( """Evaluate and return a dataframe.""" columns = [c.evaluate(df) for c in exprs] if should_broadcast: - columns = broadcast(*columns, target_length=df.num_rows) + columns = broadcast( + *columns, target_length=df.num_rows if df.num_columns != 0 else None + ) else: # Polars ensures this is true, but let's make sure nothing # went wrong. In this case, the parent node is a @@ -1769,8 +1763,6 @@ def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR) self._non_child_args = (zlice,) self.children = children schema = self.children[0].schema - if not all(s.schema == schema for s in self.children[1:]): - raise NotImplementedError("Schema mismatch") @classmethod def do_evaluate(cls, zlice: tuple[int, int] | None, *dfs: DataFrame) -> DataFrame: diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py index 2138ac0c700..640fc8d81c5 100644 --- a/python/cudf_polars/cudf_polars/dsl/translate.py +++ b/python/cudf_polars/cudf_polars/dsl/translate.py @@ -84,7 +84,7 @@ def translate_ir(self, *, n: int | None = None) -> ir.IR: # IR is versioned with major.minor, minor is bumped for backwards # compatible changes (e.g. adding new nodes), major is bumped for # incompatible changes (e.g. renaming nodes). - if (version := self.visitor.version()) >= (4, 3): + if (version := self.visitor.version()) >= (5, 1): e = NotImplementedError( f"No support for polars IR {version=}" ) # pragma: no cover; no such version for now. @@ -302,25 +302,52 @@ def _( # Join key dtypes are dependent on the schema of the left and # right inputs, so these must be translated with the relevant # input active. 
+ def adjust_literal_dtype(literal: expr.Literal) -> expr.Literal: + if literal.dtype.id() == plc.types.TypeId.INT32: + plc_int64 = plc.types.DataType(plc.types.TypeId.INT64) + return expr.Literal( + plc_int64, + pa.scalar(literal.value.as_py(), type=plc.interop.to_arrow(plc_int64)), + ) + return literal + + def maybe_adjust_binop(e) -> expr.Expr: + if isinstance(e.value, expr.BinOp): + left, right = e.value.children + if isinstance(left, expr.Col) and isinstance(right, expr.Literal): + e.value.children = (left, adjust_literal_dtype(right)) + elif isinstance(left, expr.Literal) and isinstance(right, expr.Col): + e.value.children = (adjust_literal_dtype(left), right) + return e + + def translate_expr_and_maybe_fix_binop_args(translator, exprs): + return [ + maybe_adjust_binop(translate_named_expr(translator, n=e)) for e in exprs + ] + with set_node(translator.visitor, node.input_left): inp_left = translator.translate_ir(n=None) - left_on = [translate_named_expr(translator, n=e) for e in node.left_on] + # TODO: There's bug in the polars type coercion phase. Use + # translate_named_expr directly once it is resolved. 
+ # Tracking issue: https://github.com/pola-rs/polars/issues/20935 + left_on = translate_expr_and_maybe_fix_binop_args(translator, node.left_on) with set_node(translator.visitor, node.input_right): inp_right = translator.translate_ir(n=None) - right_on = [translate_named_expr(translator, n=e) for e in node.right_on] + right_on = translate_expr_and_maybe_fix_binop_args(translator, node.right_on) + if (how := node.options[0]) in { - "inner", - "left", - "right", - "full", - "cross", - "semi", - "anti", + "Inner", + "Left", + "Right", + "Full", + "Cross", + "Semi", + "Anti", }: return ir.Join(schema, left_on, right_on, node.options, inp_left, inp_right) else: - how, op1, op2 = how - if how != "ie_join": + how, op1, op2 = node.options[0] + if how != "IEJoin": raise NotImplementedError( f"Unsupported join type {how}" ) # pragma: no cover; asof joins not yet exposed diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index e453a8b89b9..0b52cf1c61c 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -8,9 +8,7 @@ from functools import partialmethod from typing import TYPE_CHECKING -import fastexcel import pytest -from packaging import version import polars @@ -124,6 +122,9 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_scan.py::test_scan_with_row_index_filter_and_limit[single-parquet-async]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_parquet-write_parquet]": "Need to add include_file_path to IR", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_csv-write_csv]": "Need to add include_file_path to IR", + "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_parquet-write_parquet]": "Debug output on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_csv-write_csv]": "Debug output 
on stderr doesn't match", + "tests/unit/io/test_scan.py::test_scan_include_file_paths[False-scan_ndjson-write_ndjson]": "Debug output on stderr doesn't match", "tests/unit/io/test_scan.py::test_scan_include_file_name[False-scan_ndjson-write_ndjson]": "Need to add include_file_path to IR", "tests/unit/io/test_write.py::test_write_async[read_parquet-write_parquet]": "Need to add include_file_path to IR", "tests/unit/io/test_write.py::test_write_async[-write_csv]": "Need to add include_file_path to IR", @@ -178,6 +179,7 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input15-expected15-input_dtype15-output_dtype15]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_median_by_dtype[input16-expected16-input_dtype16-output_dtype16]": "Unsupported groupby-agg for a particular dtype", "tests/unit/operations/test_group_by.py::test_group_by_binary_agg_with_literal": "Incorrect broadcasting of literals in groupby-agg", + "tests/unit/operations/test_group_by.py::test_group_by_lit_series": "Incorrect broadcasting of literals in groupby-agg", "tests/unit/operations/test_group_by.py::test_aggregated_scalar_elementwise_15602": "Unsupported boolean function/dtype combination in groupby-agg", "tests/unit/operations/test_group_by.py::test_schemas[data1-expr1-expected_select1-expected_gb1]": "Mismatching dtypes, needs cudf#15852", "tests/unit/operations/test_join.py::test_cross_join_slice_pushdown": "Need to implement slice pushdown for cross joins", @@ -194,10 +196,6 @@ def pytest_configure(config: pytest.Config) -> None: # Maybe flaky, order-dependent? 
"tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", - "tests/unit/io/test_spreadsheet.py::test_write_excel_bytes[calamine]": ( - "Fails when fastexcel version >= 0.12.1. tracking issue: https://github.com/pola-rs/polars/issues/20698", - version.parse(fastexcel.__version__) >= version.parse("0.12.1"), - ), } @@ -211,6 +209,11 @@ def pytest_configure(config: pytest.Config) -> None: # polars that the requested timezone is unknown. # Since this is random, just skip it, rather than xfailing. "tests/unit/lazyframe/test_serde.py::test_lf_serde_roundtrip_binary": "chrono_tz doesn't have all tzdata symlink names", + # The test may segfault with the legacy streaming engine. We should + # remove this skip when all polars tests use the new streaming engine. + "tests/unit/streaming/test_streaming_group_by.py::test_streaming_group_by_literal[1]": "May segfault w/the legacy streaming engine", + # Fails in CI, but passes locally + "tests/unit/streaming/test_streaming.py::test_streaming_streamable_functions": "RuntimeError: polars_python::sql::PySQLContext is unsendable, but is being dropped on another thread", } @@ -233,4 +236,7 @@ def pytest_collection_modifyitems( reason=EXPECTED_FAILURES[item.nodeid][0], ), ) - item.add_marker(pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid])) + else: + item.add_marker( + pytest.mark.xfail(reason=EXPECTED_FAILURES[item.nodeid]) + ) diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py index b08cede8f7f..e9d735bdf72 100644 --- a/python/cudf_polars/cudf_polars/utils/versions.py +++ b/python/cudf_polars/cudf_polars/utils/versions.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Version utilities so that cudf_polars supports a range of polars versions.""" @@ -12,14 +12,11 @@ POLARS_VERSION = parse(__version__) -POLARS_VERSION_LT_111 = POLARS_VERSION < parse("1.11") -POLARS_VERSION_LT_112 = POLARS_VERSION < parse("1.12") -POLARS_VERSION_GT_112 = POLARS_VERSION > parse("1.12") -POLARS_VERSION_LT_113 = POLARS_VERSION < parse("1.13") +POLARS_VERSION_LT_120 = POLARS_VERSION < parse("1.20") def _ensure_polars_version(): - if POLARS_VERSION_LT_111: + if POLARS_VERSION_LT_120: raise ImportError( - "cudf_polars requires py-polars v1.11 or greater." + "cudf_polars requires py-polars v1.20 or greater." ) # pragma: no cover diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 9fb9bbf391e..15547f85d56 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.11,<1.18", + "polars>=1.20,<1.22", "pylibcudf==25.2.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py index f1f47bfb9f1..4dbf972dc9f 100644 --- a/python/cudf_polars/tests/test_join.py +++ b/python/cudf_polars/tests/test_join.py @@ -7,13 +7,11 @@ import pytest import polars as pl -from polars.testing import assert_frame_equal from cudf_polars.testing.asserts import ( assert_gpu_result_equal, assert_ir_translation_raises, ) -from cudf_polars.utils.versions import POLARS_VERSION_LT_112, POLARS_VERSION_LT_113 @pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"]) @@ -96,15 +94,7 @@ def test_left_join_with_slice(left, right, join_nulls, zlice): q = left.join(right, on="a", how="left", join_nulls=join_nulls, coalesce=True) ctx = nullcontext() if zlice is not None: - q_expect = q.collect().slice(*zlice) q = q.slice(*zlice) - if POLARS_VERSION_LT_112 and (zlice == (1, 5) or zlice == (0, 2)): - # https://github.com/pola-rs/polars/issues/19403 - # https://github.com/pola-rs/polars/issues/19405 - ctx = pytest.raises(AssertionError) - assert_frame_equal( - q_expect, q.collect(engine=pl.GPUEngine(raise_on_fail=True)) - ) with ctx: assert_gpu_result_equal(q) @@ -125,28 +115,25 @@ def test_cross_join(left, right, zlice): (pl.lit(2, dtype=pl.Int64), pl.col("a")), ], ) -def test_join_literal_key_unsupported(left, right, left_on, right_on): +def test_join_literal_key(left, right, left_on, right_on): q = left.join(right, left_on=left_on, right_on=right_on, how="inner") - - assert_ir_translation_raises(q, NotImplementedError) + assert_gpu_result_equal(q) @pytest.mark.parametrize( "conditions", [ [pl.col("a") < pl.col("a_right")], - [pl.col("a_right") <= pl.col("a") * 2], + [ + pl.col("a_right") <= pl.col("a") * 2, + pl.col("a_right") <= 2 * pl.col("a"), + ], [pl.col("b") * 2 > pl.col("a_right"), pl.col("a") == pl.col("c_right")], [pl.col("b") * 2 <= pl.col("a_right"), pl.col("a") < pl.col("c_right")], - pytest.param( - [pl.col("b") <= pl.col("a_right") * 7, 
pl.col("a") < pl.col("d") * 2], - marks=pytest.mark.xfail( - POLARS_VERSION_LT_113, - reason="https://github.com/pola-rs/polars/issues/19597", - ), - ), + [pl.col("b") <= pl.col("a_right") * 7, pl.col("a") < pl.col("d") * 2], ], ) +@pytest.mark.parametrize("zlice", [None, (0, 5)]) def test_join_where(left, right, conditions, zlice): q = left.join_where(right, *conditions) diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py index 865b95a7d91..de75900f8c0 100644 --- a/python/cudf_polars/tests/test_union.py +++ b/python/cudf_polars/tests/test_union.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -6,7 +6,6 @@ from cudf_polars.testing.asserts import ( assert_gpu_result_equal, - assert_ir_translation_raises, ) @@ -22,19 +21,6 @@ def test_union(): assert_gpu_result_equal(query) -def test_union_schema_mismatch_raises(): - ldf = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5, 6, 7], - "b": [1, 1, 1, 1, 1, 1, 1], - } - ).lazy() - ldf2 = ldf.select(pl.col("a").cast(pl.Float32)) - query = pl.concat([ldf, ldf2], how="diagonal") - - assert_ir_translation_raises(query, NotImplementedError) - - def test_concat_vertical(): ldf = pl.LazyFrame( {