From 922ca75dda5202744bbbe3c6fc077e2ce55ba941 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sat, 25 Jan 2025 11:11:11 -0600 Subject: [PATCH 1/5] Add support for `pyarrow-19` (#17794) This PR upgrades the upper bound pinnings for `pyarrow` in `cudf`. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17794 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- dependencies.yaml | 6 +++--- python/cudf/cudf/tests/test_parquet.py | 4 ++++ python/cudf/pyproject.toml | 4 ++-- python/pylibcudf/pyproject.toml | 6 +++--- 6 files changed, 14 insertions(+), 10 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 60d8e96d932..dbb44890965 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -70,7 +70,7 @@ dependencies: - polars>=1.11,<1.18 - pre-commit - ptxcompiler -- pyarrow>=14.0.0,<19.0.0a0 +- pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 - pynvml>=12.0.0,<13.0.0a0 - pytest-benchmark diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index fe1a32ccb87..1b674596a4b 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -67,7 +67,7 @@ dependencies: - pandoc - polars>=1.11,<1.18 - pre-commit -- pyarrow>=14.0.0,<19.0.0a0 +- pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 - pynvjitlink>=0.0.0a0 - pynvml>=12.0.0,<13.0.0a0 diff --git a/dependencies.yaml b/dependencies.yaml index edd83e6e07d..54da3d98d09 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -483,7 +483,7 @@ dependencies: common: - output_types: [conda] packages: - - pyarrow>=14.0.0,<19.0.0a0 + - pyarrow>=14.0.0,<20.0.0a0 - output_types: [requirements, pyproject] packages: # pyarrow 17.0.0 wheels have a subtle issue around threading that @@ -491,8 +491,8 @@ dependencies: # be highly dependent on the exact build configuration, so we'll just # avoid 17.0.0 for now unless we observe similar issues in future # releases as well. 
- - pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64' - - pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64' + - pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64' + - pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64' cuda_version: specific: - output_types: conda diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py index 9d5f32c7ab9..9ff2a6f0ed7 100644 --- a/python/cudf/cudf/tests/test_parquet.py +++ b/python/cudf/cudf/tests/test_parquet.py @@ -4373,6 +4373,10 @@ def test_parquet_reader_mismatched_nullability_structs(tmpdir): ) +@pytest.mark.skipif( + pa.__version__ == "19.0.0", + reason="https://github.com/rapidsai/cudf/issues/17806", +) @pytest.mark.parametrize( "stats_fname,bloom_filter_fname", [ diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 2b03f515657..bd2a710e84a 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -31,8 +31,8 @@ dependencies = [ "packaging", "pandas>=2.0,<2.2.4dev0", "ptxcompiler", - "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'", + "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", "pylibcudf==25.2.*,>=0.0.0a0", "rich", "rmm==25.2.*,>=0.0.0a0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index e0055d5ebf8..efa3d301334 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -22,8 +22,8 @@ dependencies = [ "libcudf==25.2.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", - "pyarrow>=14.0.0,<19.0.0a0,!=17.0.0; platform_machine=='aarch64'", - "pyarrow>=14.0.0,<19.0.0a0; platform_machine=='x86_64'", + "pyarrow>=14.0.0,<20.0.0a0,!=17.0.0; platform_machine=='aarch64'", + "pyarrow>=14.0.0,<20.0.0a0; platform_machine=='x86_64'", "rmm==25.2.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. From 133e0c869531af94474e0bbb66cb22c5f8ba80f2 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sat, 25 Jan 2025 11:39:46 -0600 Subject: [PATCH 2/5] Resolve race-condition in `disable_module_accelerator` (#17811) Fixes: #17775 This PR fixes a race condition that arises when `disable_module_accelerator` is used in a multi-threaded setting. 
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17811 --- python/cudf/cudf/pandas/module_accelerator.py | 21 ++++---- .../data/disable_cudf_pandas_multi_thread.py | 49 +++++++++++++++++++ .../test_disable_pandas_accelerator.py | 35 +++++++++++++ python/cudf/cudf_pandas_tests/test_main.py | 4 +- .../cudf/cudf_pandas_tests/test_profiler.py | 4 +- 5 files changed, 100 insertions(+), 13 deletions(-) create mode 100644 python/cudf/cudf_pandas_tests/data/disable_cudf_pandas_multi_thread.py create mode 100644 python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index 38103a71908..9e549713f7b 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -503,17 +503,20 @@ def disabled(self): ------- Context manager for disabling things """ - try: - self._use_fast_lib_lock.acquire() - # The same thread might enter this context manager - # multiple times, so we need to remember the previous - # value + with self._use_fast_lib_lock: + # Have to hold the lock to modify this variable since + # another thread might be reading it. + # Modification has to happen with the lock held for the + # duration, so if someone else has modified things, then + # we block trying to acquire the lock (hence it is safe to + # release the lock after modifying this value) saved = self._use_fast_lib self._use_fast_lib = False + try: yield finally: - self._use_fast_lib = saved - self._use_fast_lib_lock.release() + with self._use_fast_lib_lock: + self._use_fast_lib = saved @staticmethod def getattr_real_or_wrapped( @@ -613,7 +616,7 @@ def disable_module_accelerator() -> contextlib.ExitStack: """ Temporarily disable any module acceleration. """ - with contextlib.ExitStack() as stack: + with ImportLock(), contextlib.ExitStack() as stack: for finder in sys.meta_path: if isinstance(finder, ModuleAcceleratorBase): stack.enter_context(finder.disabled()) diff --git a/python/cudf/cudf_pandas_tests/data/disable_cudf_pandas_multi_thread.py b/python/cudf/cudf_pandas_tests/data/disable_cudf_pandas_multi_thread.py new file mode 100644 index 00000000000..2cc6cc1ef5b --- /dev/null +++ b/python/cudf/cudf_pandas_tests/data/disable_cudf_pandas_multi_thread.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import queue +from concurrent.futures import ALL_COMPLETED, ThreadPoolExecutor, wait + + +# Function to be called in each thread +def thread_function(index): + # Import the library in the thread + import pandas as pd + + x = pd.Series([1, 2, 3]) + + return f"{index}" + str(type(type(x))) + + +def main(): + # Number of threads to use + num_threads = 4 + + # Queue of tasks to be processed by the threads + task_queue = queue.Queue() + for i in range(num_threads): + task_queue.put((i,)) + + # List to hold the futures + futures = [] + + # Use ThreadPoolExecutor to manage the threads + with ThreadPoolExecutor(max_workers=num_threads) as executor: + while not task_queue.empty(): + task = task_queue.get() + future = executor.submit(thread_function, *task) + futures.append(future) + + # Wait for all threads to complete + _, _ = wait(futures, return_when=ALL_COMPLETED) + + # Process the results + for i, future in enumerate(futures): + result = future.result() + print(f"Result from thread {i + 1}: {result}") + + +if __name__ == "__main__": + from cudf.pandas.module_accelerator import disable_module_accelerator + + with disable_module_accelerator(): + main() diff --git a/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py b/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py new file mode 100644 index 00000000000..c7af6cc5ebf --- /dev/null +++ b/python/cudf/cudf_pandas_tests/test_disable_pandas_accelerator.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import os +import subprocess + +from cudf.testing import _utils as utils + + +def test_disable_pandas_accelerator_multi_threaded(): + data_directory = os.path.dirname(os.path.abspath(__file__)) + # Create a copy of the current environment variables + env = os.environ.copy() + + with utils.cudf_timeout(10): + sp_completed = subprocess.run( + [ + "python", + "-m", + "cudf.pandas", + data_directory + "/data/disable_cudf_pandas_multi_thread.py", + ], + capture_output=True, + text=True, + env=env, + ) + assert sp_completed.returncode == 0 + output = sp_completed.stdout + + for string in [ + "Result from thread 1: 0", + "Result from thread 2: 1", + "Result from thread 3: 2", + "Result from thread 4: 3", + ]: + assert string in output diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py index 326224c8fc0..9db49d55c90 100644 --- a/python/cudf/cudf_pandas_tests/test_main.py +++ b/python/cudf/cudf_pandas_tests/test_main.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # All rights reserved. # SPDX-License-Identifier: Apache-2.0 @@ -57,7 +57,7 @@ def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf(): expect = _run_python(cudf_pandas=False, command=input_args_and_code) assert "cudf" in res.stdout - assert "cudf" not in expect.stdout + assert " Date: Sun, 26 Jan 2025 10:31:00 -0800 Subject: [PATCH 3/5] Make more constexpr available on device for cuIO (#17746) Contributes to https://github.com/rapidsai/cudf/issues/7795 This PR addressed most of the relaxed constexpr in cuIO. 
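
For context, a minimal standalone sketch (not taken from this change) of the behavior these annotations address, assuming `CUDF_HOST_DEVICE` expands to `__host__ __device__` when compiled by NVCC: a plain host `constexpr` function can only be called from device code when nvcc's `--expt-relaxed-constexpr` flag is enabled, whereas an explicitly host+device `constexpr` function, together with `cuda::std::` utilities in place of `std::` ones, needs no such flag.

```cpp
// Illustrative sketch only; hypothetical names, not code from this patch.
#include <cuda/std/limits>

// Host-only constexpr: calling this from a kernel compiles only with
// nvcc's --expt-relaxed-constexpr flag, which this series moves away from.
constexpr int round_up_host_only(int v, int m) { return ((v + m - 1) / m) * m; }

// Explicitly host+device constexpr: callable from kernels without special flags.
__host__ __device__ constexpr int round_up(int v, int m) { return ((v + m - 1) / m) * m; }

__global__ void kernel(int* out)
{
  // OK: round_up is __host__ __device__, and cuda::std::numeric_limits is usable
  // in device code, unlike std::numeric_limits without relaxed constexpr.
  out[0] = round_up(cuda::std::numeric_limits<int>::digits, 8);
  // out[1] = round_up_host_only(35, 8);  // error without --expt-relaxed-constexpr
}

int main()
{
  int* d_out;
  cudaMalloc(&d_out, sizeof(int));
  kernel<<<1, 1>>>(d_out);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}
```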
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17746 --- .../cudf/detail/utilities/integer_utils.hpp | 18 ++--- cpp/include/cudf/fixed_point/temporary.hpp | 6 +- .../cudf/io/text/detail/multistate.hpp | 16 +++-- .../strings/detail/convert/fixed_point.cuh | 8 ++- .../detail/convert/fixed_point_to_string.cuh | 8 ++- .../strings/detail/convert/int_to_string.cuh | 4 +- .../cudf/strings/detail/strings_children.cuh | 24 +++++-- cpp/src/io/csv/datetime.cuh | 4 +- cpp/src/io/json/write_json.cu | 70 ++++++++++++++----- cpp/src/io/orc/orc.hpp | 4 +- cpp/src/io/orc/stats_enc.cu | 4 +- cpp/src/io/orc/stripe_enc.cu | 7 +- cpp/src/io/orc/stripe_init.cu | 5 +- cpp/src/io/orc/writer_impl.cu | 2 +- .../io/parquet/compact_protocol_reader.cpp | 4 +- cpp/src/io/parquet/decode_preprocess.cu | 4 +- cpp/src/io/parquet/delta_binary.cuh | 6 +- cpp/src/io/parquet/delta_enc.cuh | 12 ++-- cpp/src/io/parquet/page_decode.cuh | 5 +- cpp/src/io/parquet/page_enc.cu | 66 ++++++++--------- cpp/src/io/parquet/parquet.hpp | 26 +++---- cpp/src/io/parquet/parquet_gpu.hpp | 42 +++++++---- cpp/src/io/parquet/reader_impl_chunking.cu | 4 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 1 + cpp/src/io/parquet/reader_impl_preprocess.cu | 2 +- cpp/src/io/parquet/rle_stream.cuh | 4 +- cpp/src/io/text/multibyte_split.cu | 14 ++-- cpp/src/io/utilities/data_casting.cu | 4 +- cpp/src/io/utilities/output_builder.cuh | 4 +- cpp/src/io/utilities/parsing_utils.cuh | 29 ++++---- 30 files changed, 245 insertions(+), 162 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 44a86f1c84f..135f645817e 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -73,7 +73,7 @@ CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus) * `modulus` is positive and does not check for overflow. */ template -constexpr S round_down_safe(S number_to_round, S modulus) noexcept +CUDF_HOST_DEVICE constexpr S round_down_safe(S number_to_round, S modulus) noexcept { auto remainder = number_to_round % modulus; auto rounded_down = number_to_round - remainder; @@ -113,16 +113,16 @@ CUDF_HOST_DEVICE constexpr S round_up_unsafe(S number_to_round, S modulus) noexc * the result will be incorrect */ template -constexpr S div_rounding_up_unsafe(S const& dividend, T const& divisor) noexcept +CUDF_HOST_DEVICE constexpr S div_rounding_up_unsafe(S const& dividend, T const& divisor) noexcept { return (dividend + divisor - 1) / divisor; } namespace detail { template -constexpr I div_rounding_up_safe(std::integral_constant, - I dividend, - I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(cuda::std::integral_constant, + I dividend, + I divisor) noexcept { // TODO: This could probably be implemented faster return (dividend > divisor) ? 
1 + div_rounding_up_unsafe(dividend - divisor, divisor) @@ -130,7 +130,9 @@ constexpr I div_rounding_up_safe(std::integral_constant, } template -constexpr I div_rounding_up_safe(std::integral_constant, I dividend, I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(cuda::std::integral_constant, + I dividend, + I divisor) noexcept { auto quotient = dividend / divisor; auto remainder = dividend % divisor; @@ -156,9 +158,9 @@ constexpr I div_rounding_up_safe(std::integral_constant, I dividend, * the non-integral division `dividend/divisor` */ template -constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept +CUDF_HOST_DEVICE constexpr I div_rounding_up_safe(I dividend, I divisor) noexcept { - using i_is_a_signed_type = std::integral_constant>; + using i_is_a_signed_type = cuda::std::integral_constant>; return detail::div_rounding_up_safe(i_is_a_signed_type{}, dividend, divisor); } diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp index 2bafe235058..643d1d07cb7 100644 --- a/cpp/include/cudf/fixed_point/temporary.hpp +++ b/cpp/include/cudf/fixed_point/temporary.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -54,7 +54,7 @@ auto to_string(T value) -> std::string } template -constexpr auto abs(T value) +CUDF_HOST_DEVICE constexpr auto abs(T value) { return value >= 0 ? value : -value; } @@ -72,7 +72,7 @@ CUDF_HOST_DEVICE inline auto max(T lhs, T rhs) } template -constexpr auto exp10(int32_t exponent) +CUDF_HOST_DEVICE constexpr auto exp10(int32_t exponent) { BaseType value = 1; while (exponent > 0) diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp index 32187b43d34..24b8738d5dd 100644 --- a/cpp/include/cudf/io/text/detail/multistate.hpp +++ b/cpp/include/cudf/io/text/detail/multistate.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,8 @@ #include +#include + #include namespace CUDF_EXPORT cudf { @@ -45,7 +47,7 @@ struct multistate { * * @note: The behavior of this function is undefined if size() => max_segment_count */ - constexpr void enqueue(uint8_t head, uint8_t tail) + CUDF_HOST_DEVICE constexpr void enqueue(uint8_t head, uint8_t tail) { _heads |= (head & 0xFu) << (_size * 4); _tails |= (tail & 0xFu) << (_size * 4); @@ -55,17 +57,17 @@ struct multistate { /** * @brief get's the number of segments this multistate represents */ - [[nodiscard]] constexpr uint8_t size() const { return _size; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t size() const { return _size; } /** * @brief get's the highest (____, tail] value this multistate represents */ - [[nodiscard]] constexpr uint8_t max_tail() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t max_tail() const { uint8_t maximum = 0; for (uint8_t i = 0; i < _size; i++) { - maximum = std::max(maximum, get_tail(i)); + maximum = cuda::std::max(maximum, get_tail(i)); } return maximum; @@ -74,7 +76,7 @@ struct multistate { /** * @brief get's the Nth (head, ____] value state this multistate represents */ - [[nodiscard]] constexpr uint8_t get_head(uint8_t idx) const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t get_head(uint8_t idx) const { return (_heads >> (idx * 4)) & 0xFu; } @@ -82,7 +84,7 @@ struct multistate { /** * @brief get's the Nth (____, tail] value state this multistate represents */ - [[nodiscard]] constexpr uint8_t get_tail(uint8_t idx) const + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint8_t get_tail(uint8_t idx) const { return (_tails >> (idx * 4)) & 0xFu; } diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh index 8440805960e..5ae4af411b6 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include +#include #include #include #include @@ -46,7 +47,7 @@ __device__ inline thrust::pair parse_integer( // highest value where another decimal digit cannot be appended without an overflow; // this preserves the most digits when scaling the final result for this type constexpr UnsignedDecimalType decimal_max = - (std::numeric_limits::max() - 9L) / 10L; + (cuda::std::numeric_limits::max() - 9L) / 10L; __uint128_t value = 0; // for checking overflow int32_t exp_offset = 0; @@ -90,7 +91,8 @@ __device__ inline thrust::pair parse_integer( template __device__ cuda::std::optional parse_exponent(char const* iter, char const* iter_end) { - constexpr uint32_t exponent_max = static_cast(std::numeric_limits::max()); + constexpr uint32_t exponent_max = + static_cast(cuda::std::numeric_limits::max()); // get optional exponent sign int32_t const exp_sign = [&iter] { diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh index 0ee26ec9ee2..af4a4ce7cd2 100644 --- a/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh +++ b/cpp/include/cudf/strings/detail/convert/fixed_point_to_string.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include +#include + namespace cudf::strings::detail { /** @@ -33,7 +35,7 @@ __device__ inline int32_t fixed_point_string_size(__int128_t const& value, int32 auto const abs_value = numeric::detail::abs(value); auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); auto const fraction = count_digits(abs_value % exp_ten); - auto const num_zeros = std::max(0, (-scale - fraction)); + auto const num_zeros = cuda::std::max(0, (-scale - fraction)); return static_cast(value < 0) + // sign if negative count_digits(abs_value / exp_ten) + // integer 1 + // decimal point @@ -66,7 +68,7 @@ __device__ inline void fixed_point_to_string(__int128_t const& value, int32_t sc if (value < 0) *out_ptr++ = '-'; // add sign auto const abs_value = numeric::detail::abs(value); auto const exp_ten = numeric::detail::exp10<__int128_t>(-scale); - auto const num_zeros = std::max(0, (-scale - count_digits(abs_value % exp_ten))); + auto const num_zeros = cuda::std::max(0, (-scale - count_digits(abs_value % exp_ten))); out_ptr += integer_to_string(abs_value / exp_ten, out_ptr); // add the integer part *out_ptr++ = '.'; // add decimal point diff --git a/cpp/include/cudf/strings/detail/convert/int_to_string.cuh b/cpp/include/cudf/strings/detail/convert/int_to_string.cuh index f6e6a10a864..39b9cd6978c 100644 --- a/cpp/include/cudf/strings/detail/convert/int_to_string.cuh +++ b/cpp/include/cudf/strings/detail/convert/int_to_string.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ __device__ inline size_type integer_to_string(IntegerType value, char* d_buffer) * @return size_type number of digits in input value */ template -constexpr size_type count_digits(IntegerType value) +__device__ constexpr size_type count_digits(IntegerType value) { if (value == 0) return 1; bool const is_negative = cuda::std::is_signed() ? (value < 0) : false; diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh index de2f1770e28..cf19baf4826 100644 --- a/cpp/include/cudf/strings/detail/strings_children.cuh +++ b/cpp/include/cudf/strings/detail/strings_children.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -41,6 +41,21 @@ namespace cudf { namespace strings { namespace detail { +template +struct string_offsets_fn { + Iter _begin; + size_type _strings_count; + constexpr string_offsets_fn(Iter begin, size_type strings_count) + : _begin{begin}, _strings_count{strings_count} + { + } + + __device__ constexpr size_type operator()(size_type idx) const noexcept + { + return idx < _strings_count ? static_cast(_begin[idx]) : size_type{0}; + }; +}; + /** * @brief Gather characters to create a strings column using the given string-index pair iterator * @@ -133,11 +148,8 @@ std::pair, int64_t> make_offsets_child_column( // using exclusive-scan technically requires strings_count+1 input values even though // the final input value is never used. 
// The input iterator is wrapped here to allow the 'last value' to be safely read. - auto map_fn = cuda::proclaim_return_type( - [begin, strings_count] __device__(size_type idx) -> size_type { - return idx < strings_count ? static_cast(begin[idx]) : size_type{0}; - }); - auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn); + auto input_itr = + cudf::detail::make_counting_transform_iterator(0, string_offsets_fn{begin, strings_count}); // Use the sizes-to-offsets iterator to compute the total number of elements auto const total_bytes = cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream); diff --git a/cpp/src/io/csv/datetime.cuh b/cpp/src/io/csv/datetime.cuh index bfdba238a1e..0463eca65e9 100644 --- a/cpp/src/io/csv/datetime.cuh +++ b/cpp/src/io/csv/datetime.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,7 +197,7 @@ __inline__ __device__ cuda::std::chrono::hh_mm_ss extract_time_of_d /** * @brief Checks whether `c` is decimal digit */ -constexpr bool is_digit(char c) { return c >= '0' and c <= '9'; } +__device__ constexpr bool is_digit(char c) { return c >= '0' and c <= '9'; } /** * @brief Parses a datetime string and computes the corresponding timestamp. diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1a0c59e365a..1587c4da9c8 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -376,6 +376,48 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, {}); } +struct scatter_fn { + column_device_view _col; + size_type* _d_strview_offsets; + string_view* _d_strviews; + size_type const* _labels; + size_type const* _list_offsets; + column_device_view _d_strings_children; + string_view _element_seperator; + string_view _element_narep; + + scatter_fn(column_device_view col, + size_type* d_strview_offsets, + string_view* d_strviews, + size_type const* labels, + size_type const* list_offsets, + column_device_view d_strings_children, + string_view const element_separator, + string_view const element_narep) noexcept + : _col{col}, + _d_strview_offsets{d_strview_offsets}, + _d_strviews{d_strviews}, + _labels{labels}, + _list_offsets{list_offsets}, + _d_strings_children{d_strings_children}, + _element_seperator{element_separator}, + _element_narep{element_narep} + { + } + + __device__ void operator()(size_type idx) const + { + auto const label = _labels[idx]; + auto const sublist_index = idx - _list_offsets[label]; + auto const strview_index = _d_strview_offsets[label] + sublist_index * 2 + 1; + // value or na_rep + auto const strview = _d_strings_children.element(idx); + _d_strviews[strview_index] = _d_strings_children.is_null(idx) ? _element_narep : strview; + // separator + if (sublist_index != 0) { _d_strviews[strview_index - 1] = _element_seperator; } + } +}; + /** * @brief Concatenates a list of strings columns into a single strings column. 
* @@ -461,24 +503,14 @@ std::unique_ptr join_list_of_strings(lists_column_view const& lists_stri thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(num_strings), - [col = *col_device_view, - d_strview_offsets = d_strview_offsets.begin(), - d_strviews = d_strviews.begin(), - labels = labels->view().begin(), - list_offsets = offsets.begin(), - d_strings_children = *d_strings_children, - element_separator, - element_narep] __device__(auto idx) { - auto const label = labels[idx]; - auto const sublist_index = idx - list_offsets[label]; - auto const strview_index = d_strview_offsets[label] + sublist_index * 2 + 1; - // value or na_rep - auto const strview = d_strings_children.element(idx); - d_strviews[strview_index] = - d_strings_children.is_null(idx) ? element_narep : strview; - // separator - if (sublist_index != 0) { d_strviews[strview_index - 1] = element_separator; } - }); + scatter_fn{*col_device_view, + d_strview_offsets.data(), + d_strviews.data(), + labels->view().data(), + offsets.data(), + *d_strings_children, + element_separator, + element_narep}); auto joined_col = make_strings_column(d_strviews, string_view{nullptr, 0}, stream, mr); diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 5ab36fdae8e..8dccf65ef10 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -707,7 +707,7 @@ struct orc_column_device_view : public column_device_view { struct rowgroup_rows { size_type begin; size_type end; - [[nodiscard]] constexpr auto size() const noexcept { return end - begin; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return end - begin; } }; } // namespace orc diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index e01b93262d7..5f4c1e0696d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -22,6 +22,8 @@ #include +#include + namespace cudf::io::orc::gpu { using strings::detail::fixed_point_string_size; @@ -212,7 +214,7 @@ __device__ inline uint8_t* pb_put_fixed64(uint8_t* p, uint32_t id, void const* r } // Splits a nanosecond timestamp into milliseconds and nanoseconds -__device__ std::pair split_nanosecond_timestamp(int64_t nano_count) +__device__ cuda::std::pair split_nanosecond_timestamp(int64_t nano_count) { auto const ns = cuda::std::chrono::nanoseconds(nano_count); auto const ms_floor = cuda::std::chrono::floor(ns); diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index bcdd059bf67..857daeb5856 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -413,8 +414,8 @@ static __device__ uint32_t IntegerRLE( // Find minimum and maximum values if (literal_run > 0) { // Find min & max - T vmin = (t < literal_run) ? v0 : std::numeric_limits::max(); - T vmax = (t < literal_run) ? v0 : std::numeric_limits::min(); + T vmin = (t < literal_run) ? v0 : cuda::std::numeric_limits::max(); + T vmax = (t < literal_run) ? 
v0 : cuda::std::numeric_limits::min(); uint32_t literal_mode, literal_w; vmin = block_reduce(temp_storage).Reduce(vmin, cub::Min()); __syncthreads(); @@ -448,7 +449,7 @@ static __device__ uint32_t IntegerRLE( } else { uint32_t range, w; // Mode 2 base value cannot be bigger than max int64_t, i.e. the first bit has to be 0 - if (vmin <= std::numeric_limits::max() and mode1_w > mode2_w and + if (vmin <= cuda::std::numeric_limits::max() and mode1_w > mode2_w and (literal_run - 1) * (mode1_w - mode2_w) > 4) { s->u.intrle.literal_mode = 2; w = mode2_w; diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index 0c739f59b0a..5e23bc5adcc 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -243,9 +244,9 @@ enum row_entry_state_e { */ static auto __device__ index_order_from_index_types(uint32_t index_types_bitmap) { - constexpr std::array full_order = {CI_PRESENT, CI_DATA, CI_DATA2}; + constexpr cuda::std::array full_order = {CI_PRESENT, CI_DATA, CI_DATA2}; - std::array partial_order; + cuda::std::array partial_order; thrust::copy_if(thrust::seq, full_order.cbegin(), full_order.cend(), diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a0cd126cff0..5c3377a1aeb 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -71,7 +71,7 @@ namespace cudf::io::orc::detail { template -[[nodiscard]] constexpr int varint_size(T val) +[[nodiscard]] CUDF_HOST_DEVICE constexpr int varint_size(T val) { auto len = 1u; while (val > 0x7f) { diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp index b8e72aaac88..023402cbcf6 100644 --- a/cpp/src/io/parquet/compact_protocol_reader.cpp +++ b/cpp/src/io/parquet/compact_protocol_reader.cpp @@ -359,10 +359,10 @@ class parquet_field_struct : public parquet_field { template class parquet_field_union_struct : public parquet_field { E& enum_val; - std::optional& val; // union structs are always wrapped in std::optional + cuda::std::optional& val; // union structs are always wrapped in std::optional public: - parquet_field_union_struct(int f, E& ev, std::optional& v) + parquet_field_union_struct(int f, E& ev, cuda::std::optional& v) : parquet_field(f), enum_val(ev), val(v) { } diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu index 5b9831668e6..2f402e3c4b8 100644 --- a/cpp/src/io/parquet/decode_preprocess.cu +++ b/cpp/src/io/parquet/decode_preprocess.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,7 +57,7 @@ __device__ size_type gpuDeltaLengthPageStringSize(page_state_s* s, int t) delta_binary_decoder string_lengths; auto const* string_start = string_lengths.find_end_of_block(s->data_start, s->data_end); // distance is size of string data - return static_cast(std::distance(string_start, s->data_end)); + return static_cast(thrust::distance(string_start, s->data_end)); } return 0; } diff --git a/cpp/src/io/parquet/delta_binary.cuh b/cpp/src/io/parquet/delta_binary.cuh index 1fa05b3a6c2..339a6233c4d 100644 --- a/cpp/src/io/parquet/delta_binary.cuh +++ b/cpp/src/io/parquet/delta_binary.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -105,7 +105,7 @@ struct delta_binary_decoder { // returns the value stored in the `value` array at index // `rolling_index(idx)`. If `idx` is `0`, then return `first_value`. - constexpr zigzag128_t value_at(size_type idx) + __device__ constexpr zigzag128_t value_at(size_type idx) { return idx == 0 ? first_value : value[rolling_index(idx)]; } @@ -113,7 +113,7 @@ struct delta_binary_decoder { // returns the number of values encoded in the block data. when all_values is true, // account for the first value in the header. otherwise just count the values encoded // in the mini-block data. - constexpr uint32_t num_encoded_values(bool all_values) + __device__ constexpr uint32_t num_encoded_values(bool all_values) { return value_count == 0 ? 0 : all_values ? value_count : value_count - 1; } diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh index 49f4ccedbf0..56b7c8065ee 100644 --- a/cpp/src/io/parquet/delta_enc.cuh +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,8 @@ #include #include +#include +#include namespace cudf::io::parquet::detail { @@ -57,7 +59,7 @@ constexpr int buffer_size = 2 * block_size; static_assert(block_size % 128 == 0); static_assert(values_per_mini_block % 32 == 0); -constexpr int rolling_idx(int index) { return rolling_index(index); } +__device__ constexpr int rolling_idx(int index) { return rolling_index(index); } // Version of bit packer that can handle up to 64 bits values. // T is the type to use for processing. if nbits <= 32 use uint32_t, otherwise unsigned long long @@ -67,8 +69,8 @@ template inline __device__ void bitpack_mini_block( uint8_t* dst, uleb128_t val, uint32_t count, uint8_t nbits, void* temp_space) { - using wide_type = - std::conditional_t, __uint128_t, uint64_t>; + using wide_type = cuda::std:: + conditional_t, __uint128_t, uint64_t>; using cudf::detail::warp_size; scratch_type constexpr mask = sizeof(scratch_type) * 8 - 1; auto constexpr div = sizeof(scratch_type) * 8; @@ -235,7 +237,7 @@ class delta_binary_packer { size_type const idx = _current_idx + t; T const delta = idx < _num_values ? subtract(_buffer[delta::rolling_idx(idx)], _buffer[delta::rolling_idx(idx - 1)]) - : std::numeric_limits::max(); + : cuda::std::numeric_limits::max(); // Find min delta for the block. 
auto const min_delta = block_reduce(*_block_tmp).Reduce(delta, cub::Min()); diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh index a5023e23cc5..b101733d35e 100644 --- a/cpp/src/io/parquet/page_decode.cuh +++ b/cpp/src/io/parquet/page_decode.cuh @@ -28,7 +28,7 @@ namespace cudf::io::parquet::detail { struct page_state_s { - constexpr page_state_s() noexcept {} + CUDF_HOST_DEVICE constexpr page_state_s() noexcept {} uint8_t const* data_start{}; uint8_t const* data_end{}; uint8_t const* lvl_end{}; @@ -121,7 +121,8 @@ struct null_count_back_copier { /** * @brief Test if the given page is in a string column */ -constexpr bool is_string_col(PageInfo const& page, device_span chunks) +__device__ constexpr bool is_string_col(PageInfo const& page, + device_span chunks) { if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return false; } auto const& col = chunks[page.chunk_idx]; diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index 7dc1255af6f..56d638c68eb 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -59,7 +62,7 @@ constexpr int encode_block_size = 128; constexpr int rle_buffer_size = 2 * encode_block_size; constexpr int num_encode_warps = encode_block_size / cudf::detail::warp_size; -constexpr int rolling_idx(int pos) { return rolling_index(pos); } +__device__ constexpr int rolling_idx(int pos) { return rolling_index(pos); } // max V1 header size // also valid for dict page header (V1 or V2) @@ -113,7 +116,7 @@ using rle_page_enc_state_s = page_enc_state_s; /** * @brief Returns the size of the type in the Parquet file. */ -constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_length) +__device__ constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_length) { if (physical_type == FIXED_LEN_BYTE_ARRAY) { return id == type_id::DECIMAL128 ? 
sizeof(__int128_t) : type_length; @@ -127,7 +130,7 @@ constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_le } } -constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_values) +__device__ constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_values) { if (value_bit_width == 0) return 0; @@ -145,7 +148,7 @@ constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_value } // subtract b from a, but return 0 if this would underflow -constexpr size_t underflow_safe_subtract(size_t a, size_t b) +__device__ constexpr size_t underflow_safe_subtract(size_t a, size_t b) { if (b > a) { return 0; } return a - b; @@ -228,7 +231,8 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t) __syncthreads(); // page fragment size must fit in a 32-bit signed integer - if (s->frag.fragment_data_size > static_cast(std::numeric_limits::max())) { + if (s->frag.fragment_data_size > + static_cast(cuda::std::numeric_limits::max())) { // TODO need to propagate this error back to the host CUDF_UNREACHABLE("page fragment size exceeds maximum for i32"); } @@ -357,7 +361,7 @@ struct BitwiseOr { template __device__ uint8_t const* delta_encode(page_enc_state_s<0>* s, uint64_t* buffer, void* temp_space) { - using output_type = std::conditional_t; + using output_type = cuda::std::conditional_t; __shared__ delta_binary_packer packer; auto const t = threadIdx.x; @@ -737,7 +741,7 @@ CUDF_KERNEL void __launch_bounds__(128) : frag_g.fragment_data_size; // page fragment size must fit in a 32-bit signed integer - if (fragment_data_size > std::numeric_limits::max()) { + if (fragment_data_size > cuda::std::numeric_limits::max()) { CUDF_UNREACHABLE("page fragment size exceeds maximum for i32"); } @@ -816,7 +820,7 @@ CUDF_KERNEL void __launch_bounds__(128) page_size + rle_pad + (write_v2_headers ? page_g.max_lvl_size : def_level_size + rep_level_size); // page size must fit in 32-bit signed integer - if (max_data_size > std::numeric_limits::max()) { + if (max_data_size > cuda::std::numeric_limits::max()) { CUDF_UNREACHABLE("page size exceeds maximum for i32"); } // if byte_array then save the variable bytes size @@ -1321,7 +1325,7 @@ static __device__ void PlainBoolEncode(rle_page_enc_state_s* s, * @return The difference between two epochs in `cuda::std::chrono::duration` format with a period * of hours. */ -constexpr auto julian_calendar_epoch_diff() +__device__ constexpr auto julian_calendar_epoch_diff() { using namespace cuda::std::chrono; using namespace cuda::std::chrono_literals; @@ -1346,7 +1350,7 @@ __device__ auto julian_days_with_time(int64_t v) auto const dur_time_of_day = dur_total - dur_days; auto const dur_time_of_day_nanos = duration_cast(dur_time_of_day); auto const julian_days = dur_days + ceil(julian_calendar_epoch_diff()); - return std::make_pair(dur_time_of_day_nanos, julian_days); + return cuda::std::pair{dur_time_of_day_nanos, julian_days}; } // this has been split out into its own kernel because of the amount of shared memory required @@ -1711,7 +1715,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) : 0; val_idx = val_idx_in_leaf_col; } - return std::make_tuple(is_valid, val_idx); + return cuda::std::make_tuple(is_valid, val_idx); }(); cur_val_idx += nvals; @@ -1950,7 +1954,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) // need to test for use_dictionary because it might be boolean uint32_t const val_idx = (s->ck.use_dictionary) ? 
val_idx_in_leaf_col - s->chunk_start_val : val_idx_in_leaf_col; - return std::make_tuple(is_valid, val_idx); + return cuda::std::tuple{is_valid, val_idx}; }(); cur_val_idx += nvals; @@ -2200,7 +2204,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8) auto const arr_size = get_element(*s->col.leaf_column, val_idx).size_bytes(); // the lengths are assumed to be INT32, check for overflow - if (arr_size > static_cast(std::numeric_limits::max())) { + if (arr_size > static_cast(cuda::std::numeric_limits::max())) { CUDF_UNREACHABLE("byte array size exceeds 2GB"); } v = static_cast(arr_size); @@ -2641,7 +2645,7 @@ class header_encoder { cpw_put_fldh(current_header_ptr, field, current_field_index, FieldType::LIST); auto const t_num = static_cast(type); current_header_ptr = cpw_put_uint8( - current_header_ptr, static_cast((std::min(len, size_t{0xfu}) << 4) | t_num)); + current_header_ptr, static_cast((cuda::std::min(len, size_t{0xfu}) << 4) | t_num)); if (len >= 0xf) { current_header_ptr = cpw_put_uint32(current_header_ptr, len); } current_field_index = 0; } @@ -2802,10 +2806,8 @@ __device__ bool increment_utf8_at(unsigned char* ptr) * * @return Pair object containing a pointer to the truncated data and its length. */ -__device__ std::pair truncate_utf8(device_span span, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_utf8( + device_span span, bool is_min, void* scratch, int32_t truncate_length) { // we know at this point that truncate_length < size_bytes, so // there is data at [len]. work backwards until we find @@ -2842,10 +2844,10 @@ __device__ std::pair truncate_utf8(device_span truncate_binary(device_span arr, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_binary(device_span arr, + bool is_min, + void* scratch, + int32_t truncate_length) { if (is_min) { return {arr.data(), truncate_length}; } memcpy(scratch, arr.data(), truncate_length); @@ -2869,10 +2871,10 @@ __device__ std::pair truncate_binary(device_span truncate_string(string_view const& str, - bool is_min, - void* scratch, - int32_t truncate_length) +__device__ cuda::std::pair truncate_string(string_view const& str, + bool is_min, + void* scratch, + int32_t truncate_length) { if (truncate_length == NO_TRUNC_STATS or str.size_bytes() <= truncate_length) { return {str.data(), str.size_bytes()}; @@ -2893,7 +2895,7 @@ __device__ std::pair truncate_string(string_view const& s /** * @brief Attempt to truncate a binary array to at most truncate_length bytes. */ -__device__ std::pair truncate_byte_array( +__device__ cuda::std::pair truncate_byte_array( statistics::byte_array_view const& arr, bool is_min, void* scratch, int32_t truncate_length) { if (truncate_length == NO_TRUNC_STATS or arr.size_bytes() <= truncate_length) { @@ -2914,11 +2916,11 @@ __device__ std::pair truncate_byte_array( * valid min or max binary value. String and byte array types will be truncated if they exceed * truncate_length. 
*/ -__device__ std::pair get_extremum(statistics_val const* stats_val, - statistics_dtype dtype, - void* scratch, - bool is_min, - int32_t truncate_length) +__device__ cuda::std::pair get_extremum(statistics_val const* stats_val, + statistics_dtype dtype, + void* scratch, + bool is_min, + int32_t truncate_length) { switch (dtype) { case dtype_bool: return {stats_val, sizeof(bool)}; diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp index dc0c4b1540e..f7cbe2bd924 100644 --- a/cpp/src/io/parquet/parquet.hpp +++ b/cpp/src/io/parquet/parquet.hpp @@ -20,6 +20,8 @@ #include +#include + #include #include #include @@ -92,10 +94,10 @@ struct LogicalType { BSON }; Type type; - std::optional decimal_type; - std::optional time_type; - std::optional timestamp_type; - std::optional int_type; + cuda::std::optional decimal_type; + cuda::std::optional time_type; + cuda::std::optional timestamp_type; + cuda::std::optional int_type; LogicalType(Type tp = UNDEFINED) : type(tp) {} LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {} @@ -103,36 +105,36 @@ struct LogicalType { LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {} LogicalType(IntType&& it) : type(INTEGER), int_type(it) {} - [[nodiscard]] constexpr bool is_time_millis() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_millis() const { return type == TIME and time_type->unit.type == TimeUnit::MILLIS; } - [[nodiscard]] constexpr bool is_time_micros() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_micros() const { return type == TIME and time_type->unit.type == TimeUnit::MICROS; } - [[nodiscard]] constexpr bool is_time_nanos() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_time_nanos() const { return type == TIME and time_type->unit.type == TimeUnit::NANOS; } - [[nodiscard]] constexpr bool is_timestamp_millis() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_millis() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS; } - [[nodiscard]] constexpr bool is_timestamp_micros() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_micros() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS; } - [[nodiscard]] constexpr bool is_timestamp_nanos() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_timestamp_nanos() const { return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS; } - [[nodiscard]] constexpr int8_t bit_width() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr int8_t bit_width() const { return type == INTEGER ? int_type->bitWidth : -1; } @@ -144,7 +146,7 @@ struct LogicalType { return type == DECIMAL ? decimal_type->scale : -1; } - [[nodiscard]] constexpr int32_t precision() const + [[nodiscard]] CUDF_HOST_DEVICE constexpr int32_t precision() const { return type == DECIMAL ? decimal_type->precision : -1; } diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 3c8d32572f8..4425f49d82d 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -52,7 +53,7 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; constexpr int LEVEL_DECODE_BUF_SIZE = 2048; template -constexpr int rolling_index(int index) +CUDF_HOST_DEVICE constexpr int rolling_index(int index) { // Cannot divide by 0. But `rolling_size` will be 0 for unused arrays, so this case will never // actual be executed. 
@@ -78,7 +79,7 @@ constexpr uint8_t REP_LVL_HIST_CUTOFF = 0; constexpr uint8_t DEF_LVL_HIST_CUTOFF = 0; // see setupLocalPageInfo() in page_decode.cuh for supported page encodings -constexpr bool is_supported_encoding(Encoding enc) +CUDF_HOST_DEVICE constexpr bool is_supported_encoding(Encoding enc) { switch (enc) { case Encoding::PLAIN: @@ -96,7 +97,8 @@ constexpr bool is_supported_encoding(Encoding enc) /** * @brief Atomically OR `error` into `error_code`. */ -constexpr void set_error(kernel_error::value_type error, kernel_error::pointer error_code) +__device__ constexpr void set_error(kernel_error::value_type error, + kernel_error::pointer error_code) { if (error != 0) { cuda::atomic_ref ref{*error_code}; @@ -162,14 +164,14 @@ using std::is_scoped_enum; // helpers to do bit operations on scoped enums template || is_scoped_enum::value))> -constexpr std::uint32_t BitAnd(Ts... bits) +CUDF_HOST_DEVICE constexpr std::uint32_t BitAnd(Ts... bits) { return (... & static_cast(bits)); } template || is_scoped_enum::value))> -constexpr std::uint32_t BitOr(Ts... bits) +CUDF_HOST_DEVICE constexpr std::uint32_t BitOr(Ts... bits) { return (... | static_cast(bits)); } @@ -401,7 +403,7 @@ inline auto make_page_key_iterator(device_span pages) * @brief Struct describing a particular chunk of column data */ struct ColumnChunkDesc { - constexpr ColumnChunkDesc() noexcept {}; + CUDF_HOST_DEVICE constexpr ColumnChunkDesc() noexcept {}; explicit ColumnChunkDesc(size_t compressed_size_, uint8_t* compressed_data_, size_t num_values_, @@ -498,8 +500,8 @@ struct parquet_column_device_view : stats_column_desc { int32_t type_length; //!< length of fixed_length_byte_array data uint8_t level_bits; //!< bits to encode max definition (lower nibble) & repetition (upper nibble) //!< levels - [[nodiscard]] constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; } - [[nodiscard]] constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; } + [[nodiscard]] __device__ constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; } + [[nodiscard]] __device__ constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; } uint8_t max_def_level; //!< needed for SizeStatistics calculation uint8_t max_rep_level; @@ -540,7 +542,7 @@ constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); struct EncPage; // convert Encoding to a mask value -constexpr uint32_t encoding_to_mask(Encoding encoding) +CUDF_HOST_DEVICE constexpr uint32_t encoding_to_mask(Encoding encoding) { return 1 << static_cast(encoding); } @@ -601,9 +603,15 @@ struct EncColumnChunk { uint32_t* rep_histogram_data; //!< Size is (max(level) + 1) * (num_data_pages + 1). size_t var_bytes_size; //!< Sum of var_bytes_size from the pages (byte arrays only) - [[nodiscard]] constexpr uint32_t num_dict_pages() const { return use_dictionary ? 1 : 0; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint32_t num_dict_pages() const + { + return use_dictionary ? 
1 : 0; + } - [[nodiscard]] constexpr uint32_t num_data_pages() const { return num_pages - num_dict_pages(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr uint32_t num_data_pages() const + { + return num_pages - num_dict_pages(); + } }; /** @@ -642,15 +650,21 @@ struct EncPage { Encoding encoding; //!< Encoding used for page data uint16_t num_fragments; //!< Number of fragments in page - [[nodiscard]] constexpr bool is_v2() const { return page_type == PageType::DATA_PAGE_V2; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_v2() const + { + return page_type == PageType::DATA_PAGE_V2; + } - [[nodiscard]] constexpr auto level_bytes() const { return def_lvl_bytes + rep_lvl_bytes; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto level_bytes() const + { + return def_lvl_bytes + rep_lvl_bytes; + } }; /** * @brief Test if the given column chunk is in a string column */ -constexpr bool is_string_col(ColumnChunkDesc const& chunk) +__device__ constexpr bool is_string_col(ColumnChunkDesc const& chunk) { // return true for non-hashed byte_array and fixed_len_byte_array that isn't representing // a decimal. diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 933be889b1a..03a37327e9b 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -1079,7 +1079,7 @@ struct decomp_sum { { return {a.codec, a.num_pages + b.num_pages, - std::max(a.max_page_decompressed_size, b.max_page_decompressed_size), + cuda::std::max(a.max_page_decompressed_size, b.max_page_decompressed_size), a.total_decompressed_size + b.total_decompressed_size}; } }; diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index 25baa1e0ec8..7d3b6a39d5b 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -30,6 +30,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 43666f9e42d..3874346e471 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -649,7 +649,7 @@ void decode_page_headers(pass_intermediate_data& pass, stream.synchronize(); } -constexpr bool is_string_chunk(ColumnChunkDesc const& chunk) +__device__ constexpr bool is_string_chunk(ColumnChunkDesc const& chunk) { auto const is_decimal = chunk.logical_type.has_value() and chunk.logical_type->type == LogicalType::DECIMAL; diff --git a/cpp/src/io/parquet/rle_stream.cuh b/cpp/src/io/parquet/rle_stream.cuh index 3c49de0c997..2de2670b7a7 100644 --- a/cpp/src/io/parquet/rle_stream.cuh +++ b/cpp/src/io/parquet/rle_stream.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -24,7 +24,7 @@ namespace cudf::io::parquet::detail { template -constexpr int rle_stream_required_run_buffer_size() +__device__ constexpr int rle_stream_required_run_buffer_size() { constexpr int num_rle_stream_decode_warps = (num_threads / cudf::detail::warp_size) - 1; return (num_rle_stream_decode_warps * 2); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 028f922bec3..37b1608463b 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -66,7 +66,7 @@ int32_t constexpr ITEMS_PER_TILE = ITEMS_PER_THREAD * THREADS_PER_TILE; int32_t constexpr TILES_PER_CHUNK = 4096; int32_t constexpr ITEMS_PER_CHUNK = ITEMS_PER_TILE * TILES_PER_CHUNK; -constexpr multistate transition_init(char c, cudf::device_span delim) +__device__ constexpr multistate transition_init(char c, cudf::device_span delim) { auto result = multistate(); @@ -79,7 +79,9 @@ constexpr multistate transition_init(char c, cudf::device_span delim return result; } -constexpr multistate transition(char c, multistate state, cudf::device_span delim) +__device__ constexpr multistate transition(char c, + multistate state, + cudf::device_span delim) { auto result = multistate(); @@ -182,7 +184,7 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( auto const thread_input_offset = tile_input_offset + cudf::thread_index_type{threadIdx.x} * ITEMS_PER_THREAD; auto const thread_input_size = - std::max(chunk_input_chars.size() - thread_input_offset, 0); + cuda::std::max(chunk_input_chars.size() - thread_input_offset, 0); // STEP 1: Load inputs @@ -257,7 +259,7 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( auto const thread_input_offset = tile_input_offset + cudf::thread_index_type{threadIdx.x} * ITEMS_PER_THREAD; auto const thread_input_size = - std::max(chunk_input_chars.size() - thread_input_offset, 0); + cuda::std::max(chunk_input_chars.size() - thread_input_offset, 0); // STEP 1: Load inputs @@ -555,7 +557,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source if (row == last_row && insert_end) { return thrust::make_pair(chars + begin, len); } else { - return thrust::make_pair(chars + begin, std::max(0, len - delim_size)); + return thrust::make_pair(chars + begin, cuda::std::max(0, len - delim_size)); }; })); return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 0c49b2e5d78..2750a17d328 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -145,7 +145,7 @@ __device__ __forceinline__ int32_t parse_unicode_hex(char const* str) * @brief Writes the UTF-8 byte sequence to \p out_it and returns the number of bytes written to * \p out_it */ -constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) +__device__ constexpr size_type write_utf8_char(char_utf8 character, char*& out_it) { auto const bytes = (out_it == nullptr) ? 
                                          : strings::detail::from_char_utf8(character, out_it);
diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh
index 8183a66f4f0..46a3880df84 100644
--- a/cpp/src/io/utilities/output_builder.cuh
+++ b/cpp/src/io/utilities/output_builder.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -59,7 +59,7 @@ class split_device_span {
   {
   }
 
-  [[nodiscard]] constexpr reference operator[](size_type i) const
+  [[nodiscard]] __device__ constexpr reference operator[](size_type i) const
   {
     return i < _head.size() ? _head[i] : _tail[i - _head.size()];
   }
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 9833dab282e..a30ede957ec 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -30,7 +30,10 @@
 #include
+#include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -158,7 +161,7 @@ __device__ __forceinline__ thrust::pair get_escaped_char(char escape
 * @return uint8_t Numeric value of the character, or `0`
 */
 template
-constexpr uint8_t decode_digit(char c, bool* valid_flag)
+__device__ constexpr uint8_t decode_digit(char c, bool* valid_flag)
 {
   if (c >= '0' && c <= '9') return c - '0';
   if constexpr (as_hex and std::is_integral_v<T>) {
@@ -210,9 +213,9 @@ CUDF_HOST_DEVICE constexpr bool is_infinity(char const* begin, char const* end)
 * @return The parsed and converted value
 */
 template
-__host__ __device__ cuda::std::optional<T> parse_numeric(char const* begin,
-                                                         char const* end,
-                                                         parse_options_view const& opts)
+CUDF_HOST_DEVICE cuda::std::optional<T> parse_numeric(char const* begin,
+                                                      char const* end,
+                                                      parse_options_view const& opts)
 {
   T value{};
   bool all_digits_valid = true;
@@ -222,8 +225,8 @@ __host__ __device__ cuda::std::optional parse_numeric(char const* begin,
   int32_t sign = (*begin == '-') ? -1 : 1;
 
   // Handle infinity
-  if (std::is_floating_point_v<T> && is_infinity(begin, end)) {
-    return sign * std::numeric_limits<T>::infinity();
+  if (cuda::std::is_floating_point_v<T> && is_infinity(begin, end)) {
+    return sign * cuda::std::numeric_limits<T>::infinity();
   }
   if (*begin == '-' || *begin == '+') begin++;
@@ -244,7 +247,7 @@ __host__ __device__ cuda::std::optional parse_numeric(char const* begin,
     ++begin;
   }
 
-  if (std::is_floating_point_v<T>) {
+  if (cuda::std::is_floating_point_v<T>) {
     // Handle fractional part of the number if necessary
     double divisor = 1;
     while (begin < end) {
@@ -449,7 +452,7 @@ __inline__ __device__ It skip_character(It const& it, char ch)
 *
 * @return Trimmed range
 */
-__inline__ __device__ std::pair<char const*, char const*> trim_whitespaces_quotes(
+__inline__ __device__ cuda::std::pair<char const*, char const*> trim_whitespaces_quotes(
   char const* begin, char const* end, char quotechar = '\0')
 {
   auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); };
@@ -471,8 +474,8 @@ __inline__ __device__ std::pair trim_whitespaces_quote
 *
 * @return Trimmed range
 */
-__inline__ __device__ std::pair<char const*, char const*> trim_whitespaces(char const* begin,
-                                                                           char const* end)
+__inline__ __device__ cuda::std::pair<char const*, char const*> trim_whitespaces(char const* begin,
+                                                                                 char const* end)
 {
   auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); };
@@ -495,9 +498,9 @@ __inline__ __device__ std::pair trim_whitespaces(char
 *
 * @return Trimmed range
 */
-__inline__ __device__ std::pair<char const*, char const*> trim_quotes(char const* begin,
-                                                                      char const* end,
-                                                                      char quotechar)
+__inline__ __device__ cuda::std::pair<char const*, char const*> trim_quotes(char const* begin,
+                                                                            char const* end,
+                                                                            char quotechar)
 {
   if ((thrust::distance(begin, end) >= 2 && *begin == quotechar &&
        *thrust::prev(end) == quotechar)) {

From 3336f01cc4014e0e7d7b60be5253b17d8e6f8602 Mon Sep 17 00:00:00 2001
From: James Lamb
Date: Mon, 27 Jan 2025 10:07:31 -0600
Subject: [PATCH 4/5] increase parallelism in nightly builds (#17792)

Contributes to https://github.com/rapidsai/build-planning/issues/136

For nightly builds, some `wheel-build-{project}` jobs currently wait to start
until some other `wheel-publish-{dependency}` jobs complete. This is
unnecessary... `wheel-build-{dependency}` jobs will upload packages to S3,
which is where `wheel-build-{project}` jobs will download them from.

This proposes changing that so that all nightly `wheel-build-*` jobs depend
only on other `wheel-build-*` jobs. This should decrease the end-to-end time
it takes for all wheels to be built and published on nightly / branch builds.

Also updates `pre-commit` config to the latest
`rapids-dependency-file-generator` version.
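To make the dependency change concrete, a simplified, hypothetical sketch of the nightly build graph is shown below; only the `needs:` lines mirror the actual edit in the diff that follows, while the full job list, `with:` inputs, and secrets of the real `.github/workflows/build.yaml` are omitted.

```yaml
# Hypothetical, trimmed-down nightly build graph (not the real workflow file).
jobs:
  wheel-build-libcudf:
    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
  wheel-publish-libcudf:
    # Publishing still waits for the build it publishes.
    needs: [wheel-build-libcudf]
    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-25.02
  wheel-build-pylibcudf:
    # Before: needs: [wheel-publish-libcudf]  (blocked on the publish step)
    # After: only the upstream *build* is required, since downstream builds
    # fetch wheels from the S3 location that wheel-build-libcudf uploads to.
    needs: [wheel-build-libcudf]
    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
```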
Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/17792
---
 .github/workflows/build.yaml | 8 ++++----
 .pre-commit-config.yaml      | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 65aebfb7f8c..f6b3fb83cdd 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -90,7 +90,7 @@ jobs:
       package-name: libcudf
       package-type: cpp
   wheel-build-pylibcudf:
-    needs: [wheel-publish-libcudf]
+    needs: [wheel-build-libcudf]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
@@ -111,7 +111,7 @@ jobs:
       package-name: pylibcudf
       package-type: python
   wheel-build-cudf:
-    needs: wheel-publish-pylibcudf
+    needs: wheel-build-pylibcudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
@@ -132,7 +132,7 @@ jobs:
       package-name: cudf
       package-type: python
   wheel-build-dask-cudf:
-    needs: wheel-publish-cudf
+    needs: wheel-build-cudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
@@ -155,7 +155,7 @@ jobs:
       package-name: dask_cudf
       package-type: python
   wheel-build-cudf-polars:
-    needs: wheel-publish-pylibcudf
+    needs: wheel-build-pylibcudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-25.02
     with:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d99b74506e4..052c6cc2cb9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -173,7 +173,7 @@ repos:
         )
       - id: verify-alpha-spec
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.16.0
+    rev: v1.17.0
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]

From 03e1f64a3678b8b24c0390a781ca99d8c2234c97 Mon Sep 17 00:00:00 2001
From: Bradley Dice
Date: Mon, 27 Jan 2025 10:38:22 -0600
Subject: [PATCH 5/5] Fix pre-commit.ci failures (#17819)

## Description

This PR fixes `pre-commit.ci` failures.

## Checklist

- [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [x] New or existing tests cover these changes.
- [x] The documentation is up to date with these changes.

Co-authored-by: Vyas Ramasubramani
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 052c6cc2cb9..965b667605c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ ci:
   autoupdate_branch: ""
   autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
   autoupdate_schedule: quarterly
-  skip: ["verify-alpha-spec"]
+  skip: ["verify-alpha-spec", "nbqa-isort"]
   submodules: false
 repos:
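For context on the `skip` change above: pre-commit.ci reads the `ci:` block of `.pre-commit-config.yaml`, and hooks listed under `skip` still run locally and in regular CI but are not executed by the pre-commit.ci service. A minimal, hypothetical sketch follows; the `nbQA` stanza and its `rev` are illustrative and not taken from this patch.

```yaml
# Minimal sketch of a .pre-commit-config.yaml with a pre-commit.ci `ci:` block.
ci:
  autoupdate_schedule: quarterly
  # These hook ids are skipped only on pre-commit.ci (they still run elsewhere),
  # e.g. when they need tools or credentials the hosted service does not provide.
  skip: ["verify-alpha-spec", "nbqa-isort"]
repos:
  - repo: https://github.com/nbQA-dev/nbQA
    rev: 1.9.1  # illustrative version
    hooks:
      - id: nbqa-isort
```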